Example #1
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
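All of the examples on this page construct an OUNoise (Ornstein-Uhlenbeck) process for action exploration. For reference, a minimal sketch of such a class is given below, assuming the constructor shapes used above (a size argument plus optional mu, theta, sigma); the default values and method names are assumptions for illustration, not code from any of the quoted repositories.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (reference sketch; defaults are assumptions)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-run mean the process decays towards
        self.theta = theta             # mean-reversion rate
        self.sigma = sigma             # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state

    # Some of the examples call .noise() instead of .sample(); treat it as an alias here.
    noise = sample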
Example #2
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2 * (self.action_range)
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters (CartPole)
        # self.gamma = 0.99  # discount factor
        # self.tau = 0.01  # for soft update of target parameters

        # Algorithm parameters (Quadcopter)
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        # learning rate
        actor_learning_rate = 0.0001
        critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size, critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        
        # Score tracker
        self.score = -np.inf
        self.best_score = -np.inf
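The Keras-style agents in Examples #2 and #3 (and Example #9 further down) build their replay memory as ReplayBuffer(buffer_size, batch_size) and later iterate over sampled experiences by attribute. A minimal sketch of the interface those constructors assume is shown below; the field names follow the attributes accessed in Example #9's learn(), and everything else is an illustrative guess.

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (interface sketch, not the quoted code)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # bounded FIFO storage
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)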
Example #4
class RandomAlgorithm(TensorflowAlgorithm):
    def __init__(self, session, scope, obs_dim, act_dim):
        super(self.__class__, self).__init__(session, scope, obs_dim, act_dim)
        self.noise = OUNoise(act_dim,
                             mu=0,
                             sigma=Context.config['alg.noise_sigma'],
                             theta=Context.config['alg.noise_theta'])

    def predict(self, s):
        return self.noise.noise()

    def train(self, episodes, steps, **callbacks):
        raise NotImplementedError

    def _save_weights(self, path):
        pass

    def _restore_weights(self, path):
        pass
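Example #4 uses the noise process on its own as a random-action baseline: predict() ignores the observation and returns a pure OU sample. A hypothetical rollout with such a baseline might look like this; session, env and the dimensions are placeholders, not part of the quoted repository.

# Hypothetical rollout; `session`, `env` and the dimensions are placeholders.
alg = RandomAlgorithm(session, scope='random', obs_dim=8, act_dim=2)

s = env.reset()
for _ in range(200):
    a = alg.predict(s)             # observation is ignored; the action is pure OU noise
    s, r, done, _ = env.step(a)
    if done:
        break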
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network 1 (w/ Target Network1)
        self.critic1_local = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic1_target = Critic(state_size, action_size,
                                     random_seed).to(device)
        self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Critic Network 2 (w/ Target Network2)
        self.critic2_local = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic2_target = Critic(state_size, action_size,
                                     random_seed).to(device)
        self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Initialize time step (for delayed actor updates every UPDATE_ACTOR_EVERY learn calls)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory."""
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            self.learn()

    def act(self, state):
        """Returns actions for given states as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        self.t_step += 1
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Target Policy Smoothing Regularization: add a small amount of clipped random noise to the target action
        if POLICY_NOISE > 0.0:
            noise = torch.empty_like(actions).data.normal_(
                0, POLICY_NOISE).to(device)
            noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP)
            # Get predicted next-state actions and Q values from target models
            actions_next = (self.actor_target(next_states) + noise).clamp(
                -1., 1.)
        else:
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)

        # Error Mitigation
        Q1_target = self.critic1_target(next_states, actions_next)
        Q2_target = self.critic2_target(next_states, actions_next)
        Q_targets_next = torch.min(Q1_target, Q2_target)

        # Compute Q targets for current states (y_i)
        # (note: this multiplies by `dones` directly, so it assumes the replay buffer
        #  already returns the non-terminal mask, i.e. 1 - done; compare Example #6,
        #  which uses (1 - dones) explicitly)
        Q_targets = rewards + dones * GAMMA * Q_targets_next

        # Compute critic1 loss
        Q1_expected = self.critic1_local(states, actions)
        critic1_loss = F.mse_loss(Q1_expected, Q_targets)
        # Minimize the loss
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1)
        self.critic1_optimizer.step()

        # Compute critic2 loss
        Q2_expected = self.critic2_local(states, actions)
        critic2_loss = F.mse_loss(Q2_expected, Q_targets)
        # Minimize the loss
        self.critic2_optimizer.zero_grad()
        critic2_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1)
        self.critic2_optimizer.step()

        # Delayed Policy Updates
        if self.t_step % UPDATE_ACTOR_EVERY == 0:
            # ---------------------------- update actor ---------------------------- #
            # Compute actor loss
            actions_pred = self.actor_local(states)
            actor_loss = -self.critic1_local(states, actions_pred).mean()
            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic1_local, self.critic1_target, TAU)
            self.soft_update(self.critic2_local, self.critic2_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_models(self):
        torch.save(self.actor_local.state_dict(), actor_solved_model)
        torch.save(self.critic1_local.state_dict(), critic1_solved_model)
        torch.save(self.critic2_local.state_dict(), critic2_solved_model)
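Example #5 above is a TD3-style agent (twin critics, target policy smoothing, delayed actor updates). A hypothetical driver loop for it is sketched below; env is assumed to follow a Gym-style vectorised API, and the sizes, episode count and the module-level hyperparameter constants (BUFFER_SIZE, GAMMA, TAU, ...) are placeholders that are not shown here.

import numpy as np

# Hypothetical training loop; `env` and the numeric arguments are placeholders.
agent = Agent(state_size=33, action_size=4, random_seed=2)

for episode in range(1, 501):
    states = env.reset()                 # one state per parallel agent
    agent.reset()                        # reset the OU noise process each episode
    score = 0.0
    while True:
        actions = agent.act(states)      # noisy actions from the local actor
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        score += np.mean(rewards)
        if np.any(dones):
            break
    print('Episode {:3d} | score {:.2f}'.format(episode, score))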
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done) 

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_models(self):
        torch.save(self.actor_local.state_dict(), actor_solved_model)
        torch.save(self.critic_local.state_dict(), critic_solved_model)
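Examples #1, #5 and #6 all assume PyTorch Actor and Critic modules built as Actor(state_size, action_size, seed) / Critic(state_size, action_size, seed). One plausible shape for those networks is sketched below; the hidden-layer sizes and the tanh output bound are assumptions consistent with the np.clip(action, -1, 1) calls above, not the actual models from the quoted repositories.

import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy network (sketch; layer sizes are assumptions)."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))   # actions bounded to [-1, 1]


class Critic(nn.Module):
    """State-action value network (sketch; layer sizes are assumptions)."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(torch.cat([x, action], dim=1)))
        return self.fc3(x)               # scalar Q-value per (state, action) pair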
Example #7
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
        # if not flags.test:
        #     with tf.name_scope(v.name.replace(':0', '')):
        #         variable_summaries(v)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
        err_c_ph = tf.placeholder(tf.float32, [], name='err_c')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        tf.summary.scalar('err_h', err_h_ph)
        tf.summary.scalar('err_c', err_c_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    saver = tf.train.Saver(trainable_var, max_to_keep=3)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(long_path=False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)

        time.sleep(1.0)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'

        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        # (note: the second component below is always zero; the intent was presumably
        #  pose[1] - local_next_goal[1], as in the analogous distance in Example #8)
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], local_next_goal[1]-local_next_goal[1]]))

        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        ref_action = [0., 0.]
        goal = [0., 0.]
        rnn_h_in = [np.zeros([1, flags.n_hidden]),
                    np.zeros([1, flags.n_hidden])] if flags.rnn_type == 'lstm' else np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_q = []
        epi_err_h = []
        epi_err_c = []
        loop_time = []
        action = [0., 0.]
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route))
            total_reward += reward

            if t > 0 and not (flags.supervision and (episode+1)%10 == 0):
                agent.Add2Mem([depth_stack, 
                               [combined_cmd], 
                               prev_a, 
                               local_next_goal, 
                               [rnn_h_in[0][0], rnn_h_in[1][0]] if flags.rnn_type == 'lstm' else rnn_h_in[0], 
                               action, 
                               reward, 
                               terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, rnn_h_out = agent.ActorPredict([depth_stack], [[combined_cmd]], [prev_a], rnn_h_in)
            action += (exploration_noise.noise() * np.asarray(agent.action_range))

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])
            
            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network') , global_step=episode)

            if len(agent.memory) > agent.batch_size and not flags.test:
                q, err_h, err_c = agent.Train()
                epi_q.append(np.amax(q))
                epi_err_h.append(err_h)
                epi_err_c.append(err_c)
            
            if result >= 1:
                if len(agent.memory) > agent.batch_size and not flags.test:
                    summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                          q_ph: np.amax(q),
                                                          err_h_ph: np.mean(epi_err_h),
                                                          err_c_ph: np.mean(epi_err_c)})
                    summary_writer.add_summary(summary, T)
                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time))
                print info_train

                episode += 1
                T += 1
                break

            t += 1
            T += 1
            rnn_h_in = rnn_h_out
            rate.sleep()
            loop_time.append(time.time() - start_time)
Example #8
def main(robot_name, rviz):
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    # env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                mu=flags.mu,
                                theta=flags.theta,
                                sigma=flags.sigma)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        agent = RDPG(flags, sess)

        trainable_var = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())

        model_dir = os.path.join(flags.model_dir, flags.model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        init_dir = os.path.join(flags.model_dir, flags.init_model_name)

        # summary
        if not flags.testing:
            print "  [*] printing trainable variables"
            for idx, v in enumerate(trainable_var):
                print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()),
                                                      v.name)
                # with tf.name_scope(v.name.replace(':0', '')):
                #     variable_summaries(v)

            reward_ph = tf.placeholder(tf.float32, [], name='reward')
            q_ph = tf.placeholder(tf.float32, [], name='q_pred')
            noise_ph = tf.placeholder(tf.float32, [], name='noise')
            epsilon_ph = tf.placeholder(tf.float32, [], name='epsilon')
            err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
            err_c_ph = tf.placeholder(tf.float32, [], name='err_c')

            tf.summary.scalar('reward', reward_ph)
            tf.summary.scalar('q_estimate', q_ph)
            tf.summary.scalar('noise', noise_ph)
            tf.summary.scalar('epsilon', epsilon_ph)
            tf.summary.scalar('err_h', err_h_ph)
            tf.summary.scalar('err_c', err_c_ph)
            merged = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

        part_var = []
        for idx, v in enumerate(trainable_var):
            if 'actor/online/encoder' in v.name or 'actor/online/controller' in v.name:
                part_var.append(v)
        saver = tf.train.Saver(trainable_var, max_to_keep=5)
        part_saver = tf.train.Saver(part_var, max_to_keep=5)

        # load model
        if 'empty' in flags.init_model_name:
            checkpoint = tf.train.get_checkpoint_state(model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')
        else:
            checkpoint = tf.train.get_checkpoint_state(init_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                part_saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')

        episode = 0
        T = 0
        epsilon = flags.init_epsilon
        noise = flags.init_noise
        # start training

        room_position = np.array([-1, -1])
        while T < flags.total_steps and not rospy.is_shutdown():
            print ''
            # set robot in a room
            init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(
            )
            env.SetObjectPose(robot_name,
                              [init_pose[0], init_pose[1], 0., init_pose[2]])
            time.sleep(1.)
            # generate a path to other room
            target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
                room_position)
            # get a long path
            map_plan, real_route = world.GetPath(init_map_pose +
                                                 target_map_pose)

            dynamic_route = copy.deepcopy(real_route)
            env.LongPathPublish(real_route)
            time.sleep(1.)

            if len(map_plan) == 0:
                print 'no path'
                continue

            pose = env.GetSelfStateGT()
            target_table_point_list, cmd_list, check_point_list = world.GetCommandPlan(
                pose, real_route)

            # print 'target_table_point_list:', target_table_point_list
            print 'cmd_list:', cmd_list

            if len(target_table_point_list) == 0:
                print 'no check point'
                continue

            table_goal = target_table_point_list[0]
            goal = world.Table2RealPosition(table_goal)
            env.target_point = goal
            env.distance = np.sqrt(
                np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

            total_reward = 0
            laser = env.GetLaserObservation()
            laser_stack = np.stack([laser, laser, laser], axis=-1)
            action = [0., 0.]
            epi_q = []
            loop_time = []
            t = 0
            terminal = False
            result = 0
            cmd_idx = 0
            status = [0]
            prev_action = [0., 0.]
            err_h_epi, err_c_epi = 0., 0.
            status_cnt = 5
            if np.random.rand() <= epsilon:
                true_flag = True
                print '-------using true actions------'
            else:
                true_flag = False

            prev_state = (np.zeros(
                (1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            prev_state_2 = (np.zeros(
                (1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            while not rospy.is_shutdown():
                start_time = time.time()
                if t == 0:
                    print 'current cmd:', cmd_list[cmd_idx]

                terminal, result, reward = env.GetRewardAndTerminate(t)
                total_reward += reward

                if t > 0:
                    status = [1] if result == 1 else [0]
                    agent.Add2Mem(
                        (laser_stack, cmd, cmd_next, cmd_skip, prev_action,
                         local_goal, prev_state, prev_state_2, action, reward,
                         terminal, status, true_action))
                    prev_state = copy.deepcopy(state)
                    prev_state_2 = copy.deepcopy(state_2)
                    prev_action = copy.deepcopy(action)
                if result > 1:
                    break
                elif result == 1:
                    if cmd_list[cmd_idx] == 5:
                        print 'Finish!!!!!!!'
                        break
                    cmd_idx += 1
                    t = 0
                    status_cnt = 0
                    table_goal = target_table_point_list[cmd_idx]
                    goal = world.Table2RealPosition(table_goal)
                    env.target_point = goal
                    env.distance = np.sqrt(
                        np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

                    continue

                local_goal = env.GetLocalPoint(goal)
                env.PathPublish(local_goal)

                cmd = [cmd_list[cmd_idx]]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [cmd_list[cmd_idx + 1]]

                if status_cnt < 5:
                    cmd = [0]
                    cmd_skip = [0]
                else:
                    cmd_next = [0]
                status_cnt += 1

                # print '{}, {}, {}'.format(cmd[0], cmd_next[0], cmd_skip[0])

                laser = env.GetLaserObservation()
                laser_stack = np.stack(
                    [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

                if not flags.testing:
                    if noise > 0:
                        noise -= flags.init_noise / flags.noise_stop_episode
                        epsilon -= flags.init_epsilon / flags.noise_stop_episode

                time1 = time.time() - start_time

                # predict action
                if t > 0:
                    prev_action = action
                if cmd_list[cmd_idx] == 5:
                    local_goal_a = local_goal
                else:
                    local_goal_a = [0., 0.]
                # input_laser, input_cmd, input_cmd_next, prev_status, prev_action, input_goal, prev_state
                action, state, state_2 = agent.ActorPredict(
                    [laser_stack], [cmd], [cmd_next], [cmd_skip], [action],
                    [local_goal_a], prev_state, prev_state_2)

                if not flags.testing:
                    if T < flags.noise_stop_episode:
                        action += (exploration_noise.noise() *
                                   np.asarray(agent.action_range) * noise)

                # get true action
                pose = env.GetSelfStateGT()
                near_goal, dynamic_route = world.GetNextNearGoal(
                    dynamic_route, pose)

                local_near_goal = env.GetLocalPoint(near_goal)
                true_action = env.Controller(local_near_goal, None, 1)

                if true_flag:
                    action = true_action

                env.SelfControl(action, agent.action_range)

                time2 = time.time() - start_time - time1

                if T > agent.batch_size and not flags.testing:
                    q, err_h, err_c = agent.Train()
                    epi_q.append(np.amax(q))
                    err_h_epi += err_h
                    err_c_epi += err_c

                if (T + 1
                    ) % flags.steps_per_checkpoint == 0 and not flags.testing:
                    saver.save(sess,
                               os.path.join(model_dir, 'network'),
                               global_step=T + 1)

                t += 1
                T += 1
                loop_time.append(time.time() - start_time)
                time3 = time.time() - start_time - time1 - time2

                used_time = time.time() - start_time

                if used_time > 0.04:
                    print '{:.4f} | {:.4f} | {:.4f} | {:.4f}'.format(
                        time1, time2, time3, used_time)
                rate.sleep()

            if T > agent.batch_size and not flags.testing and t > 0:
                if not true_flag:
                    summary = sess.run(merged,
                                       feed_dict={
                                           reward_ph: total_reward,
                                           q_ph: np.amax(q),
                                           noise_ph: noise,
                                           epsilon_ph: epsilon,
                                           err_h_ph: err_h_epi / t,
                                           err_c_ph: err_c_epi / t
                                       })
                    summary_writer.add_summary(summary, T)
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Q:{:.2f} | Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.amax(q),
                    np.mean(loop_time))

            else:
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} |  Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.mean(loop_time))
            episode += 1
Example #9
class DDPGAgent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = .5 #0
        self.exploration_theta = 0.6 #0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        #Agent score parameters
        self.score=0.0
        self.best_score=-1e10
        self.worst_score=1e10

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
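For completeness, a hypothetical episode loop for the task-based agent above; task is assumed to expose reset() and step(action) returning (next_state, reward, done), matching the attributes read in __init__.

# Hypothetical usage of DDPGAgent; `task` is a placeholder environment wrapper.
agent = DDPGAgent(task)

for episode in range(1, 1001):
    state = agent.reset_episode()
    total_reward = 0.0
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        total_reward += reward
    agent.score = total_reward
    agent.best_score = max(agent.best_score, total_reward)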
Example #10
def testing(sess, model):
    # --------------------load model-------------------------
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    init_dir = os.path.join(flags.model_dir, flags.init_model_name)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    all_var = tf.global_variables()
    load_var = all_var[:21]
    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(load_var):
        print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
        if 'encoder' in v.name or 'controller' in v.name:
            part_var.append(v)
    saver = tf.train.Saver(trainable_var, max_to_keep=5)
    part_saver = tf.train.Saver(load_var, max_to_keep=1)

    # load model
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        part_saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Network model loaded: ", checkpoint.model_checkpoint_path)
    else:
        print('No model is found')

    # ----------------------init env-------------------------
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    robot_name = 'robot1'
    rviz = False
    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    init_pose = world.RandomInitPose()
    env.target_point = init_pose

    episode = 0
    T = 0

    # ------------------start to test------------------------
    room_position = np.array([-1, -1])
    SR_cnt = 0
    TIME_list = []
    POSE_list = []
    while not rospy.is_shutdown():
        print ''
        time.sleep(1.)
        # set robot in a room
        init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(
            room_position)
        env.SetObjectPose(robot_name,
                          [init_pose[0], init_pose[1], 0., init_pose[2]])
        # generate a path to other room
        target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
            room_position)
        # get a long path
        map_plan, real_route = world.GetPath(init_map_pose + target_map_pose)

        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)

        if len(map_plan) == 0:
            print 'no path'
            continue

        pose = env.GetSelfStateGT()
        target_table_point_list, cmd_list, checkpoint_list = world.GetCommandPlan(
            pose, real_route)
        cmd_list += [0]
        print 'target_table_point_list:', target_table_point_list
        print 'cmd_list:', cmd_list

        if len(target_table_point_list) == 0:
            print 'no check point'
            continue

        table_goal = target_table_point_list[0]
        goal = world.Table2RealPosition(table_goal)
        env.target_point = goal
        env.distance = np.sqrt(
            np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        laser = env.GetLaserObservation()
        laser_stack = np.stack([laser, laser, laser], axis=-1)
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False
        result = 0
        cmd_idx = 0
        prob = 0.
        used_action = [0., 0.]
        true_action = [0., 0.]
        logits = [0., 0.]
        cnt = 5
        status = 0
        prev_status = 0
        exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                    mu=flags.mu,
                                    theta=flags.theta,
                                    sigma=flags.sigma)
        epi_start_time = time.time()
        positions = []
        CMD_list = []
        while not rospy.is_shutdown():
            start_time = time.time()
            if t == 0:
                print 'current cmd:', cmd_list[cmd_idx]
            prev_result = result
            terminal, result, reward = env.GetRewardAndTerminate(t)
            total_reward += reward
            # if t > 0:
            #     print  '{}, {}, | {:.3f}, {:.3f} | {}, {}, {}'.format(result, status, logits[0], logits[1], cmd[0], cmd_next[0], cmd_skip[0])
            true_status = 0
            if result > 1:
                break
            # elif result == 1:
            #     true_status = 1
            #     if cmd_list[cmd_idx] == 5:
            #         print 'Finish!!!!!!!'
            #         break
            #     cmd_idx += 1
            #     t = 0
            #     table_goal = target_table_point_list[cmd_idx]
            #     goal = world.Table2RealPosition(table_goal)
            #     env.target_point = goal
            #     env.distance = np.sqrt(np.linalg.norm([pose[0]-goal[0], pose[1]-goal[1]]))
            #     continue

            local_goal = env.GetLocalPoint(goal)
            env.PathPublish(local_goal)

            if cnt < 5:
                cnt += 1
                cmd = [0]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [0]
            else:
                cmd = [cmd_list[cmd_idx]]
                cmd_next = [0]
                cmd_skip = [cmd_list[cmd_idx + 1]]
            CMD_list.append(cmd[0])
            curr_status = [cmd[0] * flags.n_cmd_type + cmd_next[0]]
            next_status = [cmd_next[0] * flags.n_cmd_type + cmd_skip[0]]
            prev_laser = laser
            laser = env.GetLaserObservation()
            if np.random.rand():
                pass
            laser_stack = np.stack(
                [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

            # predict action
            if cmd_list[cmd_idx] == 5:
                local_goal_a = local_goal
            else:
                local_goal_a = [0., 0.]

            prev_status = copy.deepcopy(status)
            status, action, logits, _ = model.Predict(
                [laser_stack], [curr_status], [next_status], [local_goal_a], t,
                [[used_action[0], used_action[1]]])
            action += (exploration_noise.noise() *
                       np.asarray(model.action_range) * 0.1)
            if prev_status == 0 and status == 1 and cnt >= 5:
                if cmd_list[cmd_idx] == 5:
                    print 'Finish!!!!!!!'
                    break
                cmd_idx += 1
                t = 0
                cnt = 0
                table_goal = target_table_point_list[cmd_idx]
                goal = world.Table2RealPosition(table_goal)
                env.target_point = goal
                env.distance = np.sqrt(
                    np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))
                continue

            # get action
            pose = env.GetSelfStateGT()
            near_goal, dynamic_route = world.GetNextNearGoal(
                dynamic_route, pose)

            local_near_goal = env.GetLocalPoint(near_goal)
            true_action = env.Controller(local_near_goal, None, 1)
            positions.append(pose[:2])
            # print action - np.asarray(true_action), action
            if cmd_list[cmd_idx] == 0:
                used_action = true_action
            else:
                used_action = action

            env.SelfControl(used_action,
                            [flags.a_linear_range, flags.a_angular_range])

            t += 1
            T += 1
            loop_time.append(time.time() - start_time)
            rate.sleep()

        print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:}'.format(
            episode, t, total_reward, T)
        episode += 1
        if cmd[0] == 5:
            SR_cnt += 1.0
            TIME_list.append(time.time() - epi_start_time)
            # LogData(positions, CMD_list, int(SR_cnt), os.path.join(CWD, 'experiments/positions'))
            print 'SR: {:.3f} | AVG TIME: {:.3f}, | MAX TIME: {:.3f}'.format(
                SR_cnt / episode, np.mean(TIME_list), np.amax(TIME_list))
Example #11
 def __init__(self, session, scope, obs_dim, act_dim):
     super(self.__class__, self).__init__(session, scope, obs_dim, act_dim)
     self.noise = OUNoise(act_dim,
                          mu=0,
                          sigma=Context.config['alg.noise_sigma'],
                          theta=Context.config['alg.noise_theta'])
Example #12
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG_BPTT(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    if not flags.test:
        saver = tf.train.Saver(max_to_keep=5, save_relative_paths=True)
    else:
        saver = tf.train.Saver(trainable_var)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    timeout_flag = False
    noise_annealing = 1.
    success_nums = np.zeros([10], dtype=np.float32)
    demo_lens = np.zeros([10], dtype=np.float32)
    results_nums = np.zeros([3], dtype=np.float32)
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            env.SetObjectPose(robot_name, [-1., -1., 0., 0.], once=True)
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)
        
        time.sleep(0.1)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq, cmd_list = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'
        
        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        # (note: the second component below is always zero; the intent was presumably
        #  pose[1] - local_next_goal[1], as in the analogous distance in Example #8)
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], local_next_goal[1]-local_next_goal[1]]))
        
        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        action = [0., 0.]
        goal = [0., 0.]
        gru_h_in = np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_err_h = []
        loop_time = []
        data_seq = []
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route),
                                                                  test= True if flags.test else False)
            total_reward += reward

            if t > 0:
                data_seq.append([depth_stack, [cmd], prev_a, action, reward, terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            # add noise
            if flags.test:
                if cmd == 0:
                    cmd = 2
            elif cmd != 2 and np.random.rand() < 0.1:
                cmd = np.random.randint(4)

            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, gru_h_out = agent.ActorPredict([depth_stack], [[cmd]], [prev_a], gru_h_in)
            if flags.test:
                action += (exploration_noise.noise() * np.asarray(agent.action_range)) * noise_annealing

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])

            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network') , global_step=episode)

            training_step_time = 0.
            if result >= 1:
                if not (flags.supervision and (episode+1)%10 == 0):
                    agent.Add2Mem(data_seq)

                if len(agent.memory) > agent.batch_size and not flags.test:
                    training_step_start_time = time.time()
                    q = agent.Train()
                    training_step_time = time.time() - training_step_start_time
                    if t > 1:
                        summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                              q_ph: np.amax(q)
                                                              })
                        summary_writer.add_summary(summary, T)

                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time)) + \
                             '| OpStepT(s): {:.3f}'.format(training_step_time)
                print info_train
                episode += 1
                demo_cnt = min(len(cmd_list), 10)
                demo_lens[demo_cnt-1] += 1
                if result == 2:
                    success_nums[demo_cnt-1] += 1
                    results_nums[0] += 1
                elif result in [3, 4]:
                    results_nums[2] += 1
                elif result == 1:
                    results_nums[1] += 1
                if flags.test and episode == 1000:
                    print 'success num distributs: ', success_nums
                    print 'demo length distributs: ', demo_lens
                    demo_lens[demo_lens==0] = 1e-12
                    print 'success rate distributs: ', success_nums/demo_lens
                    print 'results nums: ', results_nums
                    return True
                break

            t += 1
            T += 1
            noise_annealing -= 1/flags.max_training_step
            gru_h_in = gru_h_out
            loop_time.append(time.time() - start_time)
            rate.sleep()
Example #13
 def _create_exploration(self):
     return OUNoise(self._act_dim, mu=0,
                    sigma=Context.config['alg.noise_sigma'],
                    theta=Context.config['alg.noise_theta'])