Example #1
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size

        # Exploration parameters
        self.decay_max   = 1.0              # exploration probability at start
        self.decay_min   = 0.01             # minimum exploration probability 
        self.decay_rate  = 0.0001           # exponential decay rate for exploration prob
        self.decay_step  = np.exp(-self.decay_rate)
        self.decay_range = self.decay_max - self.decay_min
        self.decay_factor = 1.
        self.explore_p = self.decay_max

        # Network parameters
        self.learning_rate = 0.0001         # Q-network learning rate
        #self.learning_rate = 0.001         # Q-network learning rate

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.score = -np.inf
        self.loss = 0

        self.qnet = QNetwork(self.state_size, self.action_size, name='main', learning_rate=self.learning_rate)

        # Episode variables
        self.reset_episode()
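Most of the examples on this page construct a ReplayBuffer(buffer_size, batch_size) and then rely on add(), sample(), and len() working on it, and the learn() methods further down read sampled experiences as e.state, e.action, e.reward, e.next_state, e.done (Examples #3 and #10 use a different, config-driven buffer). The buffer class itself is not shown here; a minimal sketch consistent with that usage (field names taken from those accesses, everything else assumed) is:

import random
from collections import namedtuple, deque

class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch, not the original project code)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # oldest experiences drop out when full
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward",
                                                  "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a single experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random minibatch of experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)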
Example #2
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.90  # discount factor
        self.tau = 0.005  # for soft update of target parameters
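The DDPG examples all add exploration noise through OUNoise(size, mu, theta, sigma), calling sample() and reset() and, in Example #5, even overwriting the state attribute directly. The class is not included on this page; the usual Ornstein-Uhlenbeck implementation that matches this interface looks roughly like the sketch below (an assumption, not code from these projects):

import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process (sketch)."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state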
Example #3
    def __init__(self, cfg):
        # Environment configuration
        self.action_shape = cfg['env']['action_shape']

        # Replay memory
        cfg['agent']['memory']['action_shape'] = self.action_shape
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Algorithm parameters
        self.exploration_mu, self.exploration_sigma = cfg['agent']['noise']

        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]
        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape,
                           cfg['env']['action_range'], self.tau,
                           self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape, self.tau,
                             cfg['critic'])

        # Flag & Counter
        self.training = True
        self.episode = 0
        self.max_episode_explore = cfg['agent']['explore']
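Examples #3 and #10 (which repeats this constructor in full) read everything from a cfg dictionary whose source file is not shown. From the keys the code actually accesses, the structure is roughly the following; all concrete values are placeholders, and the inner contents of 'memory', 'actor', and 'critic' depend on constructors that do not appear on this page:

cfg = {
    'env': {
        'action_shape': (4,),          # placeholder shape
        'action_range': (0.0, 900.0),  # placeholder range
    },
    'agent': {
        'memory': {                    # expanded into ReplayBuffer(**...) after 'action_shape' is added
            'buffer_size': 100000,
            'batch_size': 64,
        },
        'noise': (0.0, 0.2),           # unpacked as (exploration_mu, exploration_sigma)
        'gamma': 0.99,
        'tau': 0.005,
        'explore': 100,                # max_episode_explore
    },
    'actor': {},                       # actor-specific hyperparameters (not shown here)
    'critic': {},                      # critic-specific hyperparameters (not shown here)
}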
Example #4
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2

        #self.exploration_mu = 0.0001
        #self.exploration_theta = 0.2
        #self.exploration_sigma = 0.25

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
        #self.tau = .005

        self.closeCount = 0

        # Score tracker #####################
        self.best_score = -np.inf
        self.total_reward = 0.0

        self.currentclose = 0
Example #5
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = task.action_high - task.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.001
        self.exploration_sigma = 1
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.05  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.score = -np.inf

        # Episode variables
        self.reset_episode()
        self.noise.reset()
        self.noise.state = [100.] * self.action_size
Example #6
    def __init__(self, task, name, loadfile=False):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.name = name
        if loadfile:
            self.actor_local.model.load_weights("./weights/" + name +
                                                "_actor.h5")
            self.critic_local.model.load_weights("./weights/" + name +
                                                 "_critic.h5")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.3 #original 0.15
        self.exploration_sigma = 0.3  #0.3 #original 0.3

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
Example #7
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # self.exploration_theta = 0.085
        # self.exploration_sigma = 0.15
        self.exploration_theta = 0.070
        self.exploration_sigma = 0.20
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.70  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
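The class above is driven through reset_episode(), act(), and step(). A typical episode loop, sketched under the assumption that the task object exposes step(action) returning (next_state, reward, done), would be:

agent = DDPG(task)

for i_episode in range(1, 1001):                      # episode count is arbitrary here
    state = agent.reset_episode()                     # resets the noise process and the task
    total_reward = 0.0
    while True:
        action = agent.act(state)                     # policy action plus OU noise
        next_state, reward, done = task.step(action)  # assumed task interface
        agent.step(action, reward, next_state, done)  # store experience and learn
        state = next_state
        total_reward += reward
        if done:
            break
    print("Episode {:4d}  reward: {:.2f}".format(i_episode, total_reward))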
Example #8
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    # name: a name used to save the neural network model weights
    # loadfile: load weights from existing model files or create an entirely new model
    def __init__(self, task, name, loadfile=False):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.name = name
        if loadfile:
            self.actor_local.model.load_weights("./weights/" + name +
                                                "_actor.h5")
            self.critic_local.model.load_weights("./weights/" + name +
                                                 "_critic.h5")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.3 #original 0.15
        self.exploration_sigma = 0.3  #0.3 #original 0.3

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)

        #rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1)) #TESTING to scale rewards to a small number.

        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def save_weights(self):
        self.actor_local.model.save_weights("./weights/" + self.name +
                                            "_actor.h5")
        self.critic_local.model.save_weights("./weights/" + self.name +
                                             "_critic.h5")

    #Notice that after training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model.
    #However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau.

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
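Example #8 adds checkpointing on top of the same agent: weights live under ./weights/<name>_actor.h5 and ./weights/<name>_critic.h5. A short sketch of how the loadfile flag and save_weights() are meant to be used ("hover" is a placeholder name; the training loop itself is the same as the one shown after Example #7):

import os
os.makedirs("./weights", exist_ok=True)   # the class assumes this directory exists

# First run: start from freshly initialized networks
agent = DDPG(task, name="hover", loadfile=False)
# ... train with the reset_episode()/act()/step() loop shown after Example #7 ...
agent.save_weights()   # writes ./weights/hover_actor.h5 and ./weights/hover_critic.h5

# Later run: resume from the saved weights instead of starting over
agent = DDPG(task, name="hover", loadfile=True)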
Example #9
class DQLagent():
    """Reinforcement Learning agent that learns using a DQL network."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size

        # Exploration parameters
        self.decay_max   = 1.0              # exploration probability at start
        self.decay_min   = 0.01             # minimum exploration probability 
        self.decay_rate  = 0.0001           # exponential decay rate for exploration prob
        self.decay_step  = np.exp(-self.decay_rate)
        self.decay_range = self.decay_max - self.decay_min
        self.decay_factor = 1.
        self.explore_p = self.decay_max

        # Network parameters
        self.learning_rate = 0.0001         # Q-network learning rate
        #self.learning_rate = 0.001         # Q-network learning rate

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor

        # Score tracker and learning parameters
        self.best_score = -np.inf
        self.score = -np.inf
        self.loss = 0

        self.qnet = QNetwork(self.state_size, self.action_size, name='main', learning_rate=self.learning_rate)

        # Episode variables
        self.reset_episode()
        
    def reset_episode(self,new_tgt_pos=None):
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset(new_tgt_pos)
        self.last_state = state
        return state

    def act(self, tfsess, state):
        """Returns actions for given state(s) as per current policy."""
        # Explore or Exploit
        if len(self.memory) > self.batch_size:
            # epsilon-greedy policy:
            self.decay_factor *= self.decay_step
            self.explore_p = self.decay_min + (self.decay_range*self.decay_factor) 
            if self.explore_p > np.random.rand():
                # Make a random action
                actions = np.random.randint(0,self.action_size)
            else:
                # Get actions from Q-network
                feed = {self.qnet.inputs_: state.reshape((1, *state.shape))}
                Qs = tfsess.run(self.qnet.output, feed_dict=feed)
                actions = np.argmax(Qs)
        else:
            # pick an action uniformly at random
            actions = np.random.randint(0,self.action_size)
        return actions

    def step(self,
             tfsess,
             action,     # int
             reward,     # np.ndarray (action_repeat,)
             next_state, # np.ndarray (state_size*action_repeat,)
             done):      # bool
         # Save experience / reward
        self.memory.add(self.last_state, action, np.mean(reward), next_state, done)

        # Track total reward and step count for the episode score
        self.total_reward += np.mean(reward)
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(tfsess,experiences)

        # Roll over last state and action
        self.last_state = next_state

    def learn(self, tfsess, expbatch):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states      = np.array([e.state for e in expbatch if e is not None])
        actions     = np.array([e.action for e in expbatch if e is not None]).astype(np.float32)
        rewards     = np.array([e.reward for e in expbatch if e is not None]).astype(np.float32)
        dones       = np.array([e.done for e in expbatch if e is not None]).astype(bool)  # boolean mask of terminal steps
        next_states = np.array([e.next_state for e in expbatch if e is not None])

        # Train network
        target_Qs = tfsess.run(self.qnet.output, feed_dict={self.qnet.inputs_: next_states})
            
        # Set target_Qs to 0 for states where episode ends
        target_Qs[dones] = np.zeros(self.action_size)
        
        targets = rewards + self.gamma * np.max(target_Qs, axis=1)

        self.loss, _ = tfsess.run([self.qnet.loss, self.qnet.opt],
                                   feed_dict={self.qnet.inputs_: states,
                                              self.qnet.targetQs_: targets,
                                              self.qnet.actions_: actions})

        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
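Example #9 is the only value-based (DQN-style) agent here; it drives a graph-mode TensorFlow 1.x model through qnet.inputs_, qnet.output, qnet.targetQs_, qnet.actions_, qnet.loss, and qnet.opt. The QNetwork class itself is not part of this page; a sketch that is consistent with those attribute names (the layer sizes and architecture are assumptions) might be:

import tensorflow as tf   # assumes the TensorFlow 1.x graph-mode API

class QNetwork:
    """Sketch of the Q-network Example #9 expects; hidden sizes are placeholders."""

    def __init__(self, state_size, action_size, name='main', learning_rate=0.0001,
                 hidden_size=64):
        with tf.variable_scope(name):
            # Placeholders the agent feeds: inputs_, actions_, targetQs_
            self.inputs_ = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions_ = tf.placeholder(tf.int32, [None], name='actions')
            self.targetQs_ = tf.placeholder(tf.float32, [None], name='target_qs')

            # Two-layer MLP producing one Q-value per discrete action
            h1 = tf.layers.dense(self.inputs_, hidden_size, activation=tf.nn.relu)
            h2 = tf.layers.dense(h1, hidden_size, activation=tf.nn.relu)
            self.output = tf.layers.dense(h2, action_size, activation=None)

            # Q-value of the action that was actually taken
            one_hot_actions = tf.one_hot(self.actions_, action_size)
            Q = tf.reduce_sum(self.output * one_hot_actions, axis=1)

            # Mean squared TD error and its optimizer (read as self.loss / self.opt)
            self.loss = tf.reduce_mean(tf.square(self.targetQs_ - Q))
            self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)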
Example #10
class Agent():
    def __init__(self, cfg):
        # Environment configuration
        self.action_shape = cfg['env']['action_shape']

        # Replay memory
        cfg['agent']['memory']['action_shape'] = self.action_shape
        self.memory = ReplayBuffer(**cfg['agent']['memory'])

        # Algorithm parameters
        self.exploration_mu, self.exploration_sigma = cfg['agent']['noise']

        self.gamma = cfg['agent']['gamma']
        self.tau = cfg['agent']['tau']

        state_flatten_shape = [np.prod(self.memory.flatten_state_shape)]
        # Actor Model
        self.actor = Actor(state_flatten_shape, self.action_shape,
                           cfg['env']['action_range'], self.tau,
                           self.memory.batch_size, cfg['actor'])

        # Critic Model
        self.critic = Critic(state_flatten_shape, self.action_shape, self.tau,
                             cfg['critic'])

        # Flag & Counter
        self.training = True
        self.episode = 0
        self.max_episode_explore = cfg['agent']['explore']

    def init_actor_critic(self):
        # Initialize target model
        self.critic.copy_local_in_target()
        self.actor.copy_local_in_target()

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done,
                        self.training)

    def act(self, state):
        self.last_state = state

        window_states = state.reshape(1, -1)
        action = self.actor.predict(window_states)

        if self.training and self.episode < self.max_episode_explore:
            p = self.episode / self.max_episode_explore
            action = p * action + (p - 1) * np.random.normal(
                self.exploration_mu, self.exploration_sigma)
        return np.clip(action.ravel(), a_max=900, a_min=0)

    def learn(self):
        if self.memory.is_sufficient():
            experiences = self.memory.sample()

            states = experiences['state'][:,
                                          0].reshape(self.memory.batch_size,
                                                     -1)
            actions = experiences['action'][:,
                                            0].reshape(self.memory.batch_size,
                                                       -1)
            rewards = experiences['reward']
            dones = experiences['done']
            next_states = experiences['next_state'][:, 0].reshape(
                self.memory.batch_size, -1)

            # get predicted next state action and Q values from target models
            actions_next = self.actor.get_targets(next_states)
            Q_targets_next = self.critic.get_targets(next_states, actions_next)

            # Compute Q targets for current states and train critic model
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
            self.critic.fit(states, actions, Q_targets)

            # Train actor model
            action_gradients = self.critic.get_actions_grad(states, actions)[0]
            self.actor.fit(states, action_gradients)

            # Soft-update target models
            self.critic.soft_update()
            self.actor.soft_update()
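Unlike the earlier agents, this one leaves several things to the caller: init_actor_critic() must be called once to copy the local weights into the targets, learn() is not invoked from step(), and the episode counter that act() uses to anneal from noise-dominated to policy-dominated actions is never incremented inside the class. A driver loop consistent with that interface (cfg as sketched after Example #3; the task interface is assumed) might look like:

agent = Agent(cfg)
agent.init_actor_critic()               # sync target networks with the local ones once

for episode in range(cfg['agent']['explore'] * 2):   # arbitrary episode count
    agent.episode = episode             # act() reads this for exploration annealing
    state = task.reset()                # assumed task interface
    done = False
    while not done:
        action = agent.act(state)       # clipped to [0, 900] inside act()
        next_state, reward, done = task.step(action)  # assumed task interface
        agent.step(action, reward, next_state, done)
        agent.learn()                   # learning must be triggered explicitly
        state = next_state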