Example 1
    def __init__(self,
                 task,
                 exploration_mu=0,
                 exploration_theta=0.15,
                 exploration_sigma=0.2,
                 tau=0.01):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = exploration_mu
        self.exploration_theta = exploration_theta
        self.exploration_sigma = exploration_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 128  # batch size (increased from 64)
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.85  # discount factor (reduced from 0.99)
        self.tau = tau  # for soft update of target parameters

        # Score tracker and episode statistics
        self.total_reward = 0
        self.best_score = -np.inf
        self.score = 0
        self.count = 0
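
Example 1 constructs an OUNoise(action_size, mu, theta, sigma) process that is not defined in the snippet. Below is a minimal sketch consistent with that signature and with the reset()/sample() calls used in the later examples; the original project's implementation may differ in detail.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state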
Example 2
def train(sess, env, args, actor, critic, actor_noise):
    """
    Agent Training
    :param sess:
    :param env:
    :param args:
    :param actor:
    :param critic:
    :param actor_noise:
    :return:
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Add exploration noise to the deterministic policy output
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:

                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]: ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward), i, ep_ave_max_q / float(j)))
                break
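
build_summaries() is called above but not shown. Given that summary_vars[0] is fed the episode reward and summary_vars[1] the average max Q, a plausible TF1-style sketch is the following; the scalar names are assumptions.

import tensorflow as tf


def build_summaries():
    """Create TF1 summary ops for episode reward and average max Q."""
    episode_reward = tf.Variable(0.)
    tf.summary.scalar("Reward", episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.summary.scalar("Qmax Value", episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.summary.merge_all()
    return summary_ops, summary_vars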
Example 3
class Agent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker
        self.best_score = -np.inf
        self.score = -np.inf

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state

        self.total_reward = 0.0
        self.count = 0

        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        # Save experience / reward
        self.total_reward += reward
        self.count += 1

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(
            np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(
            np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local); the trailing 0/1 is typically the Keras
        # learning-phase flag (0 = inference, 1 = training)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(
            self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + \
            (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
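
For context, a driver loop for this Agent might look like the hypothetical sketch below. It assumes a task object exposing reset() and step(action) returning (next_state, reward, done), matching the attributes read in __init__; the run_training name is ours.

def run_training(agent, task, num_episodes=500):
    """Hypothetical episode loop for the Keras-based DDPG Agent above."""
    for i_episode in range(1, num_episodes + 1):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
        print("Episode {:4d}: score = {:7.3f} (best = {:7.3f})".format(
            i_episode, agent.score, agent.best_score))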
Example 4
    def __init__(self,
                 state_size,
                 action_size,
                 actor,
                 critic,
                 action_low=-1.0,
                 action_high=1.0,
                 lrate_critic=10e-3,
                 lrate_actor=10e-4,
                 tau=0.001,
                 buffer_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 exploration_mu=0.0,
                 exploration_theta=0.15,
                 noise_decay=1.,
                 exploration_sigma=0.20,
                 restore=None,
                 weight_decay=0.,
                 update_every=1,
                 update_repeat=1,
                 seed=None):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.seed = seed if seed else np.random.randint(100)
        self.lrate_critic = lrate_critic
        self.lrate_actor = lrate_actor
        self.tau = tau
        self.gamma = gamma
        self.restore = restore
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.update_every = update_every
        self.device = torch.device(DEVICE)
        self.weight_decay = weight_decay
        self.update_repeat = update_repeat
        self.noise_decay = noise_decay

        # actors networks
        self.actor = actor(state_size,
                           action_size,
                           low=action_low,
                           high=action_high,
                           seed=self.seed)
        self.actor_target = actor(state_size,
                                  action_size,
                                  low=action_low,
                                  high=action_high,
                                  seed=self.seed)

        # critic networks
        self.critic = critic(state_size, action_size, seed=self.seed)
        self.critic_target = critic(state_size, action_size, seed=self.seed)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=DEVICE)
            self.actor.load_state_dict(checkpoint['actor'])
            self.actor_target.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.critic_target.load_state_dict(checkpoint['critic'])

        # optimizer
        self.actor_opt = optim.Adam(self.actor.parameters(),
                                    lr=lrate_actor,
                                    weight_decay=self.weight_decay)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=lrate_critic,
                                     weight_decay=self.weight_decay)

        # noise
        self.noise = OUNoise(action_size, exploration_mu, exploration_theta,
                             exploration_sigma)
        self.noise_scale = 1.0

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.device, self.buffer_size,
                                          self.batch_size)

        # reset agent for training
        self.reset_episode()
        self.it = 0
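
Examples 4 and 5 build a ReplayBuffer(device, buffer_size, batch_size) whose definition is not included. A minimal sketch consistent with how it is constructed here and how sample() is consumed in Example 5's learn() (float tensors on the given device, with 1-D reward and done tensors):

import random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class ReplayBuffer:
    """Fixed-size experience buffer that samples mini-batches as tensors."""

    def __init__(self, device, buffer_size, batch_size):
        self.device = device
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        batch = random.sample(self.memory, self.batch_size)

        def to_tensor(values):
            return torch.from_numpy(np.asarray(values, dtype=np.float32)).to(self.device)

        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch])
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([float(e.done) for e in batch])
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)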
Example 5
class Agent():
    def __init__(self,
                 state_size,
                 action_size,
                 actor,
                 critic,
                 action_low=-1.0,
                 action_high=1.0,
                 lrate_critic=10e-3,
                 lrate_actor=10e-4,
                 tau=0.001,
                 buffer_size=1e5,
                 batch_size=64,
                 gamma=0.99,
                 exploration_mu=0.0,
                 exploration_theta=0.15,
                 noise_decay=1.,
                 exploration_sigma=0.20,
                 restore=None,
                 weight_decay=0.,
                 update_every=1,
                 update_repeat=1,
                 seed=None):
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high
        self.seed = seed if seed else np.random.randint(100)
        self.lrate_critic = lrate_critic
        self.lrate_actor = lrate_actor
        self.tau = tau
        self.gamma = gamma
        self.restore = restore
        self.batch_size = int(batch_size)
        self.buffer_size = int(buffer_size)
        self.update_every = update_every
        self.device = torch.device(DEVICE)
        self.weight_decay = weight_decay
        self.update_repeat = update_repeat
        self.noise_decay = noise_decay

        # actors networks
        self.actor = actor(state_size,
                           action_size,
                           low=action_low,
                           high=action_high,
                           seed=self.seed)
        self.actor_target = actor(state_size,
                                  action_size,
                                  low=action_low,
                                  high=action_high,
                                  seed=self.seed)

        # critic networks
        self.critic = critic(state_size, action_size, seed=self.seed)
        self.critic_target = critic(state_size, action_size, seed=self.seed)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=DEVICE)
            self.actor.load_state_dict(checkpoint['actor'])
            self.actor_target.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.critic_target.load_state_dict(checkpoint['critic'])

        # optimizer
        self.actor_opt = optim.Adam(self.actor.parameters(),
                                    lr=lrate_actor,
                                    weight_decay=self.weight_decay)
        self.critic_opt = optim.Adam(self.critic.parameters(),
                                     lr=lrate_critic,
                                     weight_decay=self.weight_decay)

        # noise
        self.noise = OUNoise(action_size, exploration_mu, exploration_theta,
                             exploration_sigma)
        self.noise_scale = 1.0

        # replay buffer
        self.replay_buffer = ReplayBuffer(self.device, self.buffer_size,
                                          self.batch_size)

        # reset agent for training
        self.reset_episode()
        self.it = 0

    def reset_episode(self):
        self.noise.reset()

    def act(self, state, learn=True):
        if not learn:
            self.actor.eval()

        with torch.no_grad():
            action = self.actor(self.tensor(state)).cpu().numpy()
        if learn:
            action += self.noise.sample() * self.noise_scale
            self.noise_scale = max(self.noise_scale * self.noise_decay, 0.01)

        self.actor.train()
        return np.clip(action, self.action_low, self.action_high)

    def save(self, path):
        dirn = os.path.dirname(path)
        if dirn and not os.path.exists(dirn):
            os.makedirs(dirn)
        params = {}
        params['actor'] = self.actor.state_dict()
        params['critic'] = self.critic.state_dict()
        torch.save(params, path)

    def step(self, state, action, reward, next_state, done):
        #pylint: disable=line-too-long
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.it += 1
        if self.it < self.batch_size or self.it % self.update_every != 0:
            return
        for _ in range(self.update_repeat):
            self.learn()

    def learn(self):
        # learn from a mini-batch sampled from the replay buffer
        state_b, action_b, reward_b, next_state_b, done_b = \
            self.replay_buffer.sample()

        # TD target: y = r + gamma * Q_target(s', mu_target(s')) * (1 - done)
        with torch.no_grad():
            q_next = self.critic_target(next_state_b,
                                        self.actor_target(next_state_b))
            y_b = reward_b.unsqueeze(1) + \
                self.gamma * q_next * (1 - done_b.unsqueeze(1))

        # update critic
        critic_loss = F.smooth_l1_loss(self.critic(state_b, action_b), y_b)
        self.critic.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

        # update actor
        action = self.actor(state_b)
        actor_loss = -self.critic(state_b, action).mean()
        self.actor.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # soft update both target networks
        self.soft_update()

    def soft_update(self):
        """Soft update of target network
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def tensor(self, x):
        return torch.from_numpy(x).float().to(self.device)
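
A possible driver loop for this PyTorch Agent, sketched under the assumption of a classic Gym-style environment whose step() returns (obs, reward, done, info); the run_ddpg name and its defaults are ours.

def run_ddpg(env, agent, num_episodes=1000, max_steps=1000,
             checkpoint="checkpoints/ddpg.pth"):
    """Hypothetical training driver for the Agent defined above."""
    for episode in range(num_episodes):
        state = env.reset()
        agent.reset_episode()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.act(state)  # noisy action while learning
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                break
        print("episode {:4d} reward {:8.2f}".format(episode, episode_reward))
    agent.save(checkpoint)
    return agent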
Example 6
    def train(self):

        writer = tf.summary.FileWriter(self.summary_path, self.sess.graph)

        self.actor.update_target_network()
        self.critic.update_target_network()

        num_episode = self.config['episode']
        batch_size = self.config['batch_size']
        gamma = self.config['gamma']
        # seed numpy and build the replay buffer with the configured seed
        np.random.seed(self.config['seed'])
        self.buffer = ReplayBuffer(self.config['buffer_size'],
                                   self.config['seed'])
        
        reward_set = []
        q_value_set = []
        loss_set = []

        for i in range(num_episode):
            info = self.env.reset()
            obs1, time_window, done = self.env.provider.f_step()

            ep_reward = 0
            ep_ave_max_q = 0
            ep_loss = 0
            for j in range(self.config['steps']):

                action0_ = info["weight"]
                action0 = np.expand_dims(action0_, axis=0)
                state1 = np.expand_dims(obs1, axis=0)
                norm_state1 = normalize_state(state1)

                action = self.actor.predict(input_num=state1.shape[0],
                                            state=norm_state1,
                                            previous_action=action0) + self.actor_noise()
                # step forward
                reward, info, done = self.env.f_step(obs1, action[0])
                state2 = np.expand_dims(info["obs2"], axis=0)
                norm_state2 = normalize_state(state2)

                # add to buffer (states are stored in normalized form)
                self.buffer.add(norm_state1, action, reward, done, action0, norm_state2)

                obs1 = info["obs2"]
                ep_reward += reward

                if self.buffer.size() >= batch_size:
                    # batch update
                    s_batch, a_batch, r_batch, t_batch, a0_batch, s2_batch = self.buffer.sample_batch(batch_size)

                    # Calculate targets
                    input_num = s2_batch.shape[0]
                    target_action = self.actor.predict_target(input_num, s2_batch, a_batch)
                    target_q = self.critic.predict_target(input_num, s2_batch, a_batch, target_action)

                    y_i = []
                    for k in range(input_num):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + gamma * target_q[k])

                    # Update the critic given the targets
                    predicted_q_value = np.reshape(y_i, (input_num, 1))
                    critic_loss,q_value = self.critic.train(input_num, s_batch, a0_batch, a_batch, predicted_q_value)

                    ep_ave_max_q += np.amax(q_value)
                    ep_loss += critic_loss

                    # Update the actor policy using the sampled gradient
                    a_out = self.actor.predict(input_num, s_batch, a0_batch)
                    grads = self.critic.action_gradients(input_num, s_batch, a0_batch, a_out)
                    self.actor.train(input_num, s_batch, a0_batch, grads[0])

                    # Update target networks
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                if done or j == self.config['steps'] - 1:
                    summary_str = self.sess.run(self.summary_ops,
                                                feed_dict={self.summary_vars[0]: ep_reward,
                                                           self.summary_vars[1]: ep_ave_max_q / float(j)})

                    writer.add_summary(summary_str, i)
                    writer.flush()
                    
                    reward_set.append(ep_reward)
                    q_value_set.append((ep_ave_max_q / float(j)))
                    loss_set.append(ep_loss/float(j) * 100)
                    print("------------------------------------------------------------------------------------------------")
                    print('Episode: {:d}, Reward: {:.4f}, Qmax: {:.4f}, loss: {:.4f}'.format(
                        i, ep_reward, ep_ave_max_q / float(j),
                        ep_loss / float(j) * 100))
                    break

        self.save_model()
        
        train_info = pd.DataFrame({"reward": reward_set, "q_value": q_value_set, "loss": loss_set})
        train_info.to_csv(self.train_info_path)
        print('Finish.')
        
        return train_info
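
As in Example 2, the per-sample y_i loop above can be written in vectorized form. A small helper sketch that computes the same Bellman targets; the bellman_targets name is ours.

import numpy as np


def bellman_targets(r_batch, t_batch, target_q, gamma):
    """Vectorized equivalent of the y_i loop: y = r + gamma * Q_target * (1 - done)."""
    r = np.asarray(r_batch, dtype=np.float32).reshape(-1, 1)
    t = np.asarray(t_batch, dtype=np.float32).reshape(-1, 1)
    q = np.asarray(target_q, dtype=np.float32).reshape(-1, 1)
    return r + gamma * q * (1.0 - t)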
Example 7
def train(sess, image_agent, continue_train=False):
    BUFFER_SIZE = 100000
    BATCH_SIZE = 128
    GAMMA = 0.9 
    TAU = 0.001 
    INIT_LRA = 0.000001
    INIT_LRC = 0.0001 
    EPISODE_MAX_STEP = 5000
    # DECAY_RATE = 0.5 
    # DECAY_STEP = 3000000
    #TOTAL_EPISODE = 30000
    TOTAL_EPISODE = 20000
    EXPLORE = 500000
    CURRENT_STEP=0
    actor = ActorNetwork(sess,BATCH_SIZE,TAU,INIT_LRA)
    critic = CriticNetwork(sess,BATCH_SIZE,TAU,INIT_LRC)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    sess.graph.finalize()
    ou = OU()
    # if continue_train:
    #     #TODO: reload network  and  params
    #     pass
    buffer_follow = ReplayBuffer(BUFFER_SIZE)
    buffer_straight = ReplayBuffer(BUFFER_SIZE)
    buffer_left = ReplayBuffer(BUFFER_SIZE)
    buffer_right = ReplayBuffer(BUFFER_SIZE)
    buffer_dict = {0:buffer_follow,1:buffer_left,2:buffer_right,3:buffer_straight}
   
    epsilon = 1.0

    env = Env("./log","./data",image_agent)
    #env.reset()
    
    for i in range(TOTAL_EPISODE):
        try:
            ob = env.reset()
        except Exception:
            continue
        total_reward = 0
        episode_step = 0
        s_t = ob
        for j in range(EPISODE_MAX_STEP):
            if s_t is None or len(s_t) < 514:
                continue
            epsilon -= 1.0 / EXPLORE
            image_input = s_t[0:-2]
            speed_input = s_t[-2:-1]
            # GO_STRAIGHT = 5.0, TURN_RIGHT = 4.0, TURN_LEFT = 3.0, LANE_FOLLOW = 2.0
            direction = s_t[-1:]
            branch_st = int(direction - 2)
            if branch_st == -2:  # REACH_GOAL = 0
                break
            a_t = np.zeros([1, 3])  # steer, throttle, brake
            noise_t = np.zeros([1, 3])
            a_t_pridect = actor.pridect_action(image_input, speed_input, branch_st)
            # OU exploration noise per action dimension (steer, throttle, brake)
            noise_t[0][0] = max(epsilon, 0) * ou.function(a_t_pridect[0][0], 0, 0.6, 0.3)
            noise_t[0][1] = max(epsilon, 0) * ou.function(a_t_pridect[0][1], 0.5, 1, 0.1)
            noise_t[0][2] = max(epsilon, 0) * ou.function(a_t_pridect[0][2], -0.1, 1, 0.05)
            a_t = a_t_pridect + noise_t
            # if(CURRENT_STEP<10000) and  j<50:
            #      a_t[0][2]=0
            #      a_t[0][1]=max(0.6,a_t[0][1])
            try:
                ob, r_t, done = env.step(a_t[0])
                s_t1 = ob
                if s_t1 is None or len(s_t1) < 514:
                    continue
                buffer_dict[branch_st].add(s_t, a_t[0], r_t, s_t1, done)
            except Exception:
                break

            

            # train Actor and Critic on a randomly chosen command branch
            branch_to_train = random.choice([0, 1, 2, 3])
            if buffer_dict[branch_to_train].count() > 128:
                train_ddpg(actor, critic, buffer_dict, BATCH_SIZE, branch_to_train)
            total_reward += r_t
            s_t = s_t1
            CURRENT_STEP += 1
            episode_step += 1
            if done:
                break
        
        print("buffer length: {}, {}, {}, {}, total reward: {}, episode steps: {}, total steps: {}".format(
            buffer_dict[0].count(),
            buffer_dict[1].count(),
            buffer_dict[2].count(),
            buffer_dict[3].count(),
            total_reward, episode_step, CURRENT_STEP))
        
        if i % 2000 == 0:
            saver.save(sess, './model/ddpg_model')
            with open("./episode.txt", "w") as log:
                log.write("{},{}\n".format(i, epsilon))
            with open("./buffer.pkl", "wb") as buffer_log:
                pickle.dump(buffer_dict, buffer_log)
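
The OU() helper used for the per-dimension exploration noise above is not shown. In DDPG code of this style it is usually a one-step Ornstein-Uhlenbeck term of the form theta * (mu - x) + sigma * N(0, 1); a sketch under that assumption:

import numpy as np


class OU:
    """One-step Ornstein-Uhlenbeck noise term (assumed interface: function(x, mu, theta, sigma))."""

    def function(self, x, mu, theta, sigma):
        # Pull the current action value x toward mu at rate theta, plus Gaussian jitter.
        return theta * (mu - x) + sigma * np.random.randn()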