Example #1
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
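All of the examples on this page construct an OUNoise (Ornstein-Uhlenbeck) process for action exploration. For reference, a minimal sketch of such a class is given below, assuming the constructor shapes used above (a size argument plus optional mu, theta, sigma); the default values and method names are assumptions for illustration, not code from any of the quoted repositories.

import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck noise process (reference sketch; defaults are assumptions)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-run mean the process decays towards
        self.theta = theta             # mean-reversion rate
        self.sigma = sigma             # scale of the random perturbation
        self.reset()

    def reset(self):
        """Reset the internal state to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.standard_normal(len(self.state))
        self.state = self.state + dx
        return self.state

    # Some of the examples call .noise() instead of .sample(); treat it as an alias here.
    noise = sample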
Example #2
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.action_range = self.action_high - self.action_low

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2 * (self.action_range)
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters (CartPole)
        # self.gamma = 0.99  # discount factor
        # self.tau = 0.01  # for soft update of target parameters

        # Algorithm parameters (Quadcopter)
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters
Example #3
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        
        # learning rate
        actor_learning_rate = 0.0001
        critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size, critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
        
        # Score tracker
        self.score = -np.inf
        self.best_score = -np.inf
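The Keras-style agents in Examples #2 and #3 (and Example #9 further down) build their replay memory as ReplayBuffer(buffer_size, batch_size) and later iterate over sampled experiences by attribute. A minimal sketch of the interface those constructors assume is shown below; the field names follow the attributes accessed in Example #9's learn(), and everything else is an illustrative guess.

import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (interface sketch, not the quoted code)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)   # bounded FIFO storage
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)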
Example #4
class RandomAlgorithm(TensorflowAlgorithm):
    def __init__(self, session, scope, obs_dim, act_dim):
        super(self.__class__, self).__init__(session, scope, obs_dim, act_dim)
        self.noise = OUNoise(act_dim,
                             mu=0,
                             sigma=Context.config['alg.noise_sigma'],
                             theta=Context.config['alg.noise_theta'])

    def predict(self, s):
        return self.noise.noise()

    def train(self, episodes, steps, **callbacks):
        raise NotImplementedError

    def _save_weights(self, path):
        pass

    def _restore_weights(self, path):
        pass
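Example #4 uses the noise process on its own as a random-action baseline: predict() ignores the observation and returns a pure OU sample. A hypothetical rollout with such a baseline might look like this; session, env and the dimensions are placeholders, not part of the quoted repository.

# Hypothetical rollout; `session`, `env` and the dimensions are placeholders.
alg = RandomAlgorithm(session, scope='random', obs_dim=8, act_dim=2)

s = env.reset()
for _ in range(200):
    a = alg.predict(s)             # observation is ignored; the action is pure OU noise
    s, r, done, _ = env.step(a)
    if done:
        break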
Example #5
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network 1 (w/ Target Network1)
        self.critic1_local = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic1_target = Critic(state_size, action_size,
                                     random_seed).to(device)
        self.critic1_optimizer = optim.Adam(self.critic1_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Critic Network 2 (w/ Target Network2)
        self.critic2_local = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic2_target = Critic(state_size, action_size,
                                     random_seed).to(device)
        self.critic2_optimizer = optim.Adam(self.critic2_local.parameters(),
                                            lr=LR_CRITIC,
                                            weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # Initialize time step (for delayed actor updates every UPDATE_ACTOR_EVERY learn calls)
        self.t_step = 0

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory."""
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            self.learn()

    def act(self, state):
        """Returns actions for given states as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()

        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()

        action += self.noise.sample()

        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        self.t_step += 1
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Target Policy Smoothing Regularization: add a small amount of clipped random noise to the target action
        if POLICY_NOISE > 0.0:
            noise = torch.empty_like(actions).data.normal_(
                0, POLICY_NOISE).to(device)
            noise = noise.clamp(-POLICY_NOISE_CLIP, POLICY_NOISE_CLIP)
            # Get predicted next-state actions and Q values from target models
            actions_next = (self.actor_target(next_states) + noise).clamp(
                -1., 1.)
        else:
            # Get predicted next-state actions and Q values from target models
            actions_next = self.actor_target(next_states)

        # Error Mitigation
        Q1_target = self.critic1_target(next_states, actions_next)
        Q2_target = self.critic2_target(next_states, actions_next)
        Q_targets_next = torch.min(Q1_target, Q2_target)

        # Compute Q targets for current states (y_i)
        # (note: this multiplies by `dones` directly, so it assumes the replay buffer
        #  already returns the non-terminal mask, i.e. 1 - done; compare Example #6,
        #  which uses (1 - dones) explicitly)
        Q_targets = rewards + dones * GAMMA * Q_targets_next

        # Compute critic1 loss
        Q1_expected = self.critic1_local(states, actions)
        critic1_loss = F.mse_loss(Q1_expected, Q_targets)
        # Minimize the loss
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(self.critic1_local.parameters(), 1)
        self.critic1_optimizer.step()

        # Compute critic2 loss
        Q2_expected = self.critic2_local(states, actions)
        critic2_loss = F.mse_loss(Q2_expected, Q_targets)
        # Minimize the loss
        self.critic2_optimizer.zero_grad()
        critic2_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(self.critic2_local.parameters(), 1)
        self.critic2_optimizer.step()

        # Delayed Policy Updates
        if self.t_step % UPDATE_ACTOR_EVERY == 0:
            # ---------------------------- update actor ---------------------------- #
            # Compute actor loss
            actions_pred = self.actor_local(states)
            actor_loss = -self.critic1_local(states, actions_pred).mean()
            # Minimize the loss
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # ----------------------- update target networks ----------------------- #
            self.soft_update(self.critic1_local, self.critic1_target, TAU)
            self.soft_update(self.critic2_local, self.critic2_target, TAU)
            self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_models(self):
        torch.save(self.actor_local.state_dict(), actor_solved_model)
        torch.save(self.critic1_local.state_dict(), critic1_solved_model)
        torch.save(self.critic2_local.state_dict(), critic2_solved_model)
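Example #5 above is a TD3-style agent (twin critics, target policy smoothing, delayed actor updates). A hypothetical driver loop for it is sketched below; env is assumed to follow a Gym-style vectorised API, and the sizes, episode count and the module-level hyperparameter constants (BUFFER_SIZE, GAMMA, TAU, ...) are placeholders that are not shown here.

import numpy as np

# Hypothetical training loop; `env` and the numeric arguments are placeholders.
agent = Agent(state_size=33, action_size=4, random_seed=2)

for episode in range(1, 501):
    states = env.reset()                 # one state per parallel agent
    agent.reset()                        # reset the OU noise process each episode
    score = 0.0
    while True:
        actions = agent.act(states)      # noisy actions from the local actor
        next_states, rewards, dones, _ = env.step(actions)
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        score += np.mean(rewards)
        if np.any(dones):
            break
    print('Episode {:3d} | score {:.2f}'.format(episode, score))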
Example #6
class Agent():
    """Interacts with and learns from the environment."""
    
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
    
    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done) 

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)                     

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def save_models(self):
        torch.save(self.actor_local.state_dict(), actor_solved_model)
        torch.save(self.critic_local.state_dict(), critic_solved_model)
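Examples #1, #5 and #6 all assume PyTorch Actor and Critic modules built as Actor(state_size, action_size, seed) / Critic(state_size, action_size, seed). One plausible shape for those networks is sketched below; the hidden-layer sizes and the tanh output bound are assumptions consistent with the np.clip(action, -1, 1) calls above, not the actual models from the quoted repositories.

import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy network (sketch; layer sizes are assumptions)."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))   # actions bounded to [-1, 1]


class Critic(nn.Module):
    """State-action value network (sketch; layer sizes are assumptions)."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(torch.cat([x, action], dim=1)))
        return self.fc3(x)               # scalar Q-value per (state, action) pair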
Example #7
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
        # if not flags.test:
        #     with tf.name_scope(v.name.replace(':0', '')):
        #         variable_summaries(v)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
        err_c_ph = tf.placeholder(tf.float32, [], name='err_c')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        tf.summary.scalar('err_h', err_h_ph)
        tf.summary.scalar('err_c', err_c_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    saver = tf.train.Saver(trainable_var, max_to_keep=3)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(long_path=False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)

        time.sleep(1.0)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'

        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        # (note: the second component below is always zero; the intent was presumably
        #  pose[1] - local_next_goal[1], as in the analogous distance in Example #8)
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], local_next_goal[1]-local_next_goal[1]]))

        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        ref_action = [0., 0.]
        goal = [0., 0.]
        rnn_h_in = [np.zeros([1, flags.n_hidden]),
                    np.zeros([1, flags.n_hidden])] if flags.rnn_type == 'lstm' else np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_q = []
        epi_err_h = []
        epi_err_c = []
        loop_time = []
        action = [0., 0.]
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route))
            total_reward += reward

            if t > 0 and not (flags.supervision and (episode+1)%10 == 0):
                agent.Add2Mem([depth_stack, 
                               [combined_cmd], 
                               prev_a, 
                               local_next_goal, 
                               [rnn_h_in[0][0], rnn_h_in[1][0]] if flags.rnn_type == 'lstm' else rnn_h_in[0], 
                               action, 
                               reward, 
                               terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, rnn_h_out = agent.ActorPredict([depth_stack], [[combined_cmd]], [prev_a], rnn_h_in)
            action += (exploration_noise.noise() * np.asarray(agent.action_range))

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])
            
            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network') , global_step=episode)

            if len(agent.memory) > agent.batch_size and not flags.test:
                q, err_h, err_c = agent.Train()
                epi_q.append(np.amax(q))
                epi_err_h.append(err_h)
                epi_err_c.append(err_c)
            
            if result >= 1:
                if len(agent.memory) > agent.batch_size and not flags.test:
                    summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                          q_ph: np.amax(q),
                                                          err_h_ph: np.mean(epi_err_h),
                                                          err_c_ph: np.mean(epi_err_c)})
                    summary_writer.add_summary(summary, T)
                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time))
                print info_train

                episode += 1
                T += 1
                break

            t += 1
            T += 1
            rnn_h_in = rnn_h_out
            rate.sleep()
            loop_time.append(time.time() - start_time)
Example #8
def main(robot_name, rviz):
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    # env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                mu=flags.mu,
                                theta=flags.theta,
                                sigma=flags.sigma)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        agent = RDPG(flags, sess)

        trainable_var = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())

        model_dir = os.path.join(flags.model_dir, flags.model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        init_dir = os.path.join(flags.model_dir, flags.init_model_name)

        # summary
        if not flags.testing:
            print "  [*] printing trainable variables"
            for idx, v in enumerate(trainable_var):
                print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()),
                                                      v.name)
                # with tf.name_scope(v.name.replace(':0', '')):
                #     variable_summaries(v)

            reward_ph = tf.placeholder(tf.float32, [], name='reward')
            q_ph = tf.placeholder(tf.float32, [], name='q_pred')
            noise_ph = tf.placeholder(tf.float32, [], name='noise')
            epsilon_ph = tf.placeholder(tf.float32, [], name='epsilon')
            err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
            err_c_ph = tf.placeholder(tf.float32, [], name='err_c')

            tf.summary.scalar('reward', reward_ph)
            tf.summary.scalar('q_estimate', q_ph)
            tf.summary.scalar('noise', noise_ph)
            tf.summary.scalar('epsilon', epsilon_ph)
            tf.summary.scalar('err_h', err_h_ph)
            tf.summary.scalar('err_c', err_c_ph)
            merged = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

        part_var = []
        for idx, v in enumerate(trainable_var):
            if 'actor/online/encoder' in v.name or 'actor/online/controller' in v.name:
                part_var.append(v)
        saver = tf.train.Saver(trainable_var, max_to_keep=5)
        part_saver = tf.train.Saver(part_var, max_to_keep=5)

        # load model
        if 'empty' in flags.init_model_name:
            checkpoint = tf.train.get_checkpoint_state(model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')
        else:
            checkpoint = tf.train.get_checkpoint_state(init_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                part_saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')

        episode = 0
        T = 0
        epsilon = flags.init_epsilon
        noise = flags.init_noise
        # start training

        room_position = np.array([-1, -1])
        while T < flags.total_steps and not rospy.is_shutdown():
            print ''
            # set robot in a room
            init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(
            )
            env.SetObjectPose(robot_name,
                              [init_pose[0], init_pose[1], 0., init_pose[2]])
            time.sleep(1.)
            # generate a path to other room
            target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
                room_position)
            # get a long path
            map_plan, real_route = world.GetPath(init_map_pose +
                                                 target_map_pose)

            dynamic_route = copy.deepcopy(real_route)
            env.LongPathPublish(real_route)
            time.sleep(1.)

            if len(map_plan) == 0:
                print 'no path'
                continue

            pose = env.GetSelfStateGT()
            target_table_point_list, cmd_list, check_point_list = world.GetCommandPlan(
                pose, real_route)

            # print 'target_table_point_list:', target_table_point_list
            print 'cmd_list:', cmd_list

            if len(target_table_point_list) == 0:
                print 'no check point'
                continue

            table_goal = target_table_point_list[0]
            goal = world.Table2RealPosition(table_goal)
            env.target_point = goal
            env.distance = np.sqrt(
                np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

            total_reward = 0
            laser = env.GetLaserObservation()
            laser_stack = np.stack([laser, laser, laser], axis=-1)
            action = [0., 0.]
            epi_q = []
            loop_time = []
            t = 0
            terminal = False
            result = 0
            cmd_idx = 0
            status = [0]
            prev_action = [0., 0.]
            err_h_epi, err_c_epi = 0., 0.
            status_cnt = 5
            if np.random.rand() <= epsilon:
                true_flag = True
                print '-------using true actions------'
            else:
                true_flag = False

            prev_state = (np.zeros(
                (1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            prev_state_2 = (np.zeros(
                (1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            while not rospy.is_shutdown():
                start_time = time.time()
                if t == 0:
                    print 'current cmd:', cmd_list[cmd_idx]

                terminal, result, reward = env.GetRewardAndTerminate(t)
                total_reward += reward

                if t > 0:
                    status = [1] if result == 1 else [0]
                    agent.Add2Mem(
                        (laser_stack, cmd, cmd_next, cmd_skip, prev_action,
                         local_goal, prev_state, prev_state_2, action, reward,
                         terminal, status, true_action))
                    prev_state = copy.deepcopy(state)
                    prev_state_2 = copy.deepcopy(state_2)
                    prev_action = copy.deepcopy(action)
                if result > 1:
                    break
                elif result == 1:
                    if cmd_list[cmd_idx] == 5:
                        print 'Finish!!!!!!!'
                        break
                    cmd_idx += 1
                    t = 0
                    status_cnt = 0
                    table_goal = target_table_point_list[cmd_idx]
                    goal = world.Table2RealPosition(table_goal)
                    env.target_point = goal
                    env.distance = np.sqrt(
                        np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

                    continue

                local_goal = env.GetLocalPoint(goal)
                env.PathPublish(local_goal)

                cmd = [cmd_list[cmd_idx]]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [cmd_list[cmd_idx + 1]]

                if status_cnt < 5:
                    cmd = [0]
                    cmd_skip = [0]
                else:
                    cmd_next = [0]
                status_cnt += 1

                # print '{}, {}, {}'.format(cmd[0], cmd_next[0], cmd_skip[0])

                laser = env.GetLaserObservation()
                laser_stack = np.stack(
                    [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

                if not flags.testing:
                    if noise > 0:
                        noise -= flags.init_noise / flags.noise_stop_episode
                        epsilon -= flags.init_epsilon / flags.noise_stop_episode

                time1 = time.time() - start_time

                # predict action
                if t > 0:
                    prev_action = action
                if cmd_list[cmd_idx] == 5:
                    local_goal_a = local_goal
                else:
                    local_goal_a = [0., 0.]
                # input_laser, input_cmd, input_cmd_next, prev_status, prev_action, input_goal, prev_state
                action, state, state_2 = agent.ActorPredict(
                    [laser_stack], [cmd], [cmd_next], [cmd_skip], [action],
                    [local_goal_a], prev_state, prev_state_2)

                if not flags.testing:
                    if T < flags.noise_stop_episode:
                        action += (exploration_noise.noise() *
                                   np.asarray(agent.action_range) * noise)

                # get true action
                pose = env.GetSelfStateGT()
                near_goal, dynamic_route = world.GetNextNearGoal(
                    dynamic_route, pose)

                local_near_goal = env.GetLocalPoint(near_goal)
                true_action = env.Controller(local_near_goal, None, 1)

                if true_flag:
                    action = true_action

                env.SelfControl(action, agent.action_range)

                time2 = time.time() - start_time - time1

                if T > agent.batch_size and not flags.testing:
                    q, err_h, err_c = agent.Train()
                    epi_q.append(np.amax(q))
                    err_h_epi += err_h
                    err_c_epi += err_c

                if (T + 1
                    ) % flags.steps_per_checkpoint == 0 and not flags.testing:
                    saver.save(sess,
                               os.path.join(model_dir, 'network'),
                               global_step=T + 1)

                t += 1
                T += 1
                loop_time.append(time.time() - start_time)
                time3 = time.time() - start_time - time1 - time2

                used_time = time.time() - start_time

                if used_time > 0.04:
                    print '{:.4f} | {:.4f} | {:.4f} | {:.4f}'.format(
                        time1, time2, time3, used_time)
                rate.sleep()

            if T > agent.batch_size and not flags.testing and t > 0:
                if not true_flag:
                    summary = sess.run(merged,
                                       feed_dict={
                                           reward_ph: total_reward,
                                           q_ph: np.amax(q),
                                           noise_ph: noise,
                                           epsilon_ph: epsilon,
                                           err_h_ph: err_h_epi / t,
                                           err_c_ph: err_c_epi / t
                                       })
                    summary_writer.add_summary(summary, T)
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Q:{:.2f} | Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.amax(q),
                    np.mean(loop_time))

            else:
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} |  Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.mean(loop_time))
            episode += 1
Example #9
class DDPGAgent():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = .5 #0
        self.exploration_theta = 0.6 #0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        #Agent score parameters
        self.score=0.0
        self.best_score=-1e10
        self.worst_score=1e10

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
         # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
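For completeness, a hypothetical episode loop for the task-based agent above; task is assumed to expose reset() and step(action) returning (next_state, reward, done), matching the attributes read in __init__.

# Hypothetical usage of DDPGAgent; `task` is a placeholder environment wrapper.
agent = DDPGAgent(task)

for episode in range(1, 1001):
    state = agent.reset_episode()
    total_reward = 0.0
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        total_reward += reward
    agent.score = total_reward
    agent.best_score = max(agent.best_score, total_reward)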
Example #10
def testing(sess, model):
    # --------------------load model-------------------------
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    init_dir = os.path.join(flags.model_dir, flags.init_model_name)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    all_var = tf.global_variables()
    load_var = all_var[:21]
    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(load_var):
        print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
        if 'encoder' in v.name or 'controller' in v.name:
            part_var.append(v)
    saver = tf.train.Saver(trainable_var, max_to_keep=5)
    part_saver = tf.train.Saver(load_var, max_to_keep=1)

    # load model
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        part_saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Network model loaded: ", checkpoint.model_checkpoint_path)
    else:
        print('No model is found')

    # ----------------------init env-------------------------
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    robot_name = 'robot1'
    rviz = False
    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    init_pose = world.RandomInitPose()
    env.target_point = init_pose

    episode = 0
    T = 0

    # ------------------start to test------------------------
    room_position = np.array([-1, -1])
    SR_cnt = 0
    TIME_list = []
    POSE_list = []
    while not rospy.is_shutdown():
        print ''
        time.sleep(1.)
        # set robot in a room
        init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(
            room_position)
        env.SetObjectPose(robot_name,
                          [init_pose[0], init_pose[1], 0., init_pose[2]])
        # generate a path to other room
        target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
            room_position)
        # get a long path
        map_plan, real_route = world.GetPath(init_map_pose + target_map_pose)

        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)

        if len(map_plan) == 0:
            print 'no path'
            continue

        pose = env.GetSelfStateGT()
        target_table_point_list, cmd_list, checkpoint_list = world.GetCommandPlan(
            pose, real_route)
        cmd_list += [0]
        print 'target_table_point_list:', target_table_point_list
        print 'cmd_list:', cmd_list

        if len(target_table_point_list) == 0:
            print 'no check point'
            continue

        table_goal = target_table_point_list[0]
        goal = world.Table2RealPosition(table_goal)
        env.target_point = goal
        env.distance = np.sqrt(
            np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        laser = env.GetLaserObservation()
        laser_stack = np.stack([laser, laser, laser], axis=-1)
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False
        result = 0
        cmd_idx = 0
        prob = 0.
        used_action = [0., 0.]
        true_action = [0., 0.]
        logits = [0., 0.]
        cnt = 5
        status = 0
        prev_status = 0
        exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                    mu=flags.mu,
                                    theta=flags.theta,
                                    sigma=flags.sigma)
        epi_start_time = time.time()
        positions = []
        CMD_list = []
        while not rospy.is_shutdown():
            start_time = time.time()
            if t == 0:
                print 'current cmd:', cmd_list[cmd_idx]
            prev_result = result
            terminal, result, reward = env.GetRewardAndTerminate(t)
            total_reward += reward
            # if t > 0:
            #     print  '{}, {}, | {:.3f}, {:.3f} | {}, {}, {}'.format(result, status, logits[0], logits[1], cmd[0], cmd_next[0], cmd_skip[0])
            true_status = 0
            if result > 1:
                break
            # elif result == 1:
            #     true_status = 1
            #     if cmd_list[cmd_idx] == 5:
            #         print 'Finish!!!!!!!'
            #         break
            #     cmd_idx += 1
            #     t = 0
            #     table_goal = target_table_point_list[cmd_idx]
            #     goal = world.Table2RealPosition(table_goal)
            #     env.target_point = goal
            #     env.distance = np.sqrt(np.linalg.norm([pose[0]-goal[0], pose[1]-goal[1]]))
            #     continue

            local_goal = env.GetLocalPoint(goal)
            env.PathPublish(local_goal)

            if cnt < 5:
                cnt += 1
                cmd = [0]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [0]
            else:
                cmd = [cmd_list[cmd_idx]]
                cmd_next = [0]
                cmd_skip = [cmd_list[cmd_idx + 1]]
            CMD_list.append(cmd[0])
            curr_status = [cmd[0] * flags.n_cmd_type + cmd_next[0]]
            next_status = [cmd_next[0] * flags.n_cmd_type + cmd_skip[0]]
            prev_laser = laser
            laser = env.GetLaserObservation()
            if np.random.rand():
                pass
            laser_stack = np.stack(
                [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

            # predict action
            if cmd_list[cmd_idx] == 5:
                local_goal_a = local_goal
            else:
                local_goal_a = [0., 0.]

            prev_status = copy.deepcopy(status)
            status, action, logits, _ = model.Predict(
                [laser_stack], [curr_status], [next_status], [local_goal_a], t,
                [[used_action[0], used_action[1]]])
            action += (exploration_noise.noise() *
                       np.asarray(model.action_range) * 0.1)
            if prev_status == 0 and status == 1 and cnt >= 5:
                if cmd_list[cmd_idx] == 5:
                    print 'Finish!!!!!!!'
                    break
                cmd_idx += 1
                t = 0
                cnt = 0
                table_goal = target_table_point_list[cmd_idx]
                goal = world.Table2RealPosition(table_goal)
                env.target_point = goal
                env.distance = np.sqrt(
                    np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))
                continue

            # get action
            pose = env.GetSelfStateGT()
            near_goal, dynamic_route = world.GetNextNearGoal(
                dynamic_route, pose)

            local_near_goal = env.GetLocalPoint(near_goal)
            true_action = env.Controller(local_near_goal, None, 1)
            positions.append(pose[:2])
            # print action - np.asarray(true_action), action
            if cmd_list[cmd_idx] == 0:
                used_action = true_action
            else:
                used_action = action

            env.SelfControl(used_action,
                            [flags.a_linear_range, flags.a_angular_range])

            t += 1
            T += 1
            loop_time.append(time.time() - start_time)
            rate.sleep()

        print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:}'.format(
            episode, t, total_reward, T)
        episode += 1
        if cmd[0] == 5:
            SR_cnt += 1.0
            TIME_list.append(time.time() - epi_start_time)
            # LogData(positions, CMD_list, int(SR_cnt), os.path.join(CWD, 'experiments/positions'))
            print 'SR: {:.3f} | AVG TIME: {:.3f}, | MAX TIME: {:.3f}'.format(
                SR_cnt / episode, np.mean(TIME_list), np.amax(TIME_list))
Example #11
 def __init__(self, session, scope, obs_dim, act_dim):
     super(self.__class__, self).__init__(session, scope, obs_dim, act_dim)
     self.noise = OUNoise(act_dim,
                          mu=0,
                          sigma=Context.config['alg.noise_sigma'],
                          theta=Context.config['alg.noise_theta'])
Example #12
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG_BPTT(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    if not flags.test:
        saver = tf.train.Saver(max_to_keep=5, save_relative_paths=True)
    else:
        saver = tf.train.Saver(trainable_var)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    timeout_flag = False
    noise_annealing = 1.
    success_nums = np.zeros([10], dtype=np.float32)
    demo_lens = np.zeros([10], dtype=np.float32)
    results_nums = np.zeros([3], dtype=np.float32)
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            env.SetObjectPose(robot_name, [-1., -1., 0., 0.], once=True)
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)
        
        time.sleep(0.1)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq, cmd_list = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'
        
        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        # (note: the second component below is always zero; the intent was presumably
        #  pose[1] - local_next_goal[1], as in the analogous distance in Example #8)
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], local_next_goal[1]-local_next_goal[1]]))
        
        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        action = [0., 0.]
        goal = [0., 0.]
        gru_h_in = np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_err_h = []
        loop_time = []
        data_seq = []
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route),
                                                                  test= True if flags.test else False)
            total_reward += reward

            if t > 0:
                data_seq.append([depth_stack, [cmd], prev_a, action, reward, terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            # add noise
            if flags.test:
                if cmd == 0:
                    cmd = 2
            elif cmd != 2 and np.random.rand() < 0.1:
                cmd = np.random.randint(4)

            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, gru_h_out = agent.ActorPredict([depth_stack], [[cmd]], [prev_a], gru_h_in)
            if flags.test:
                action += (exploration_noise.noise() * np.asarray(agent.action_range)) * noise_annealing

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])

            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network') , global_step=episode)

            training_step_time = 0.
            if result >= 1:
                if not (flags.supervision and (episode+1)%10 == 0):
                    agent.Add2Mem(data_seq)

                if len(agent.memory) > agent.batch_size and not flags.test:
                    training_step_start_time = time.time()
                    q = agent.Train()
                    training_step_time = time.time() - training_step_start_time
                    if t > 1:
                        summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                              q_ph: np.amax(q)
                                                              })
                        summary_writer.add_summary(summary, T)

                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time)) + \
                             '| OpStepT(s): {:.3f}'.format(training_step_time)
                print info_train
                episode += 1
                demo_cnt = min(len(cmd_list), 10)
                demo_lens[demo_cnt-1] += 1
                if result == 2:
                    success_nums[demo_cnt-1] += 1
                    results_nums[0] += 1
                elif result in [3, 4]:
                    results_nums[2] += 1
                elif result == 1:
                    results_nums[1] += 1
                if flags.test and episode == 1000:
                    print 'success num distributs: ', success_nums
                    print 'demo length distributs: ', demo_lens
                    demo_lens[demo_lens==0] = 1e-12
                    print 'success rate distributs: ', success_nums/demo_lens
                    print 'results nums: ', results_nums
                    return True
                break

            t += 1
            T += 1
            noise_annealing -= 1/flags.max_training_step
            gru_h_in = gru_h_out
            loop_time.append(time.time() - start_time)
            rate.sleep()
Example #13
 def _create_exploration(self):
     return OUNoise(self._act_dim, mu=0,
                    sigma=Context.config['alg.noise_sigma'],
                    theta=Context.config['alg.noise_theta'])