Example #1
class RandomAlgorithm(TensorflowAlgorithm):
    def __init__(self, session, scope, obs_dim, act_dim):
        super(self.__class__, self).__init__(session, scope, obs_dim, act_dim)
        self.noise = OUNoise(act_dim,
                             mu=0,
                             sigma=Context.config['alg.noise_sigma'],
                             theta=Context.config['alg.noise_theta'])

    def predict(self, s):
        return self.noise.noise()

    def train(self, episodes, steps, **callbacks):
        raise NotImplementedError

    def _save_weights(self, path):
        pass

    def _restore_weights(self, path):
        pass
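
All of the examples on this page construct an OUNoise object with mu, theta, and sigma parameters and call its noise() method to perturb predicted actions for exploration. For reference, below is a minimal sketch of an Ornstein-Uhlenbeck noise class matching that interface; it is an illustrative assumption (including the default theta and sigma values), and the class actually used by these projects may differ in details such as reset handling or time-step scaling.

import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck process producing temporally correlated exploration noise."""

    def __init__(self, action_dimension, mu=0., theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), one value per action dimension
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
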
Example #2
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
        # with tf.name_scope(v.name.replace(':0', '')):
        #     variable_summaries(v)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
        err_c_ph = tf.placeholder(tf.float32, [], name='err_c')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        tf.summary.scalar('err_h', err_h_ph)
        tf.summary.scalar('err_c', err_c_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    saver = tf.train.Saver(trainable_var, max_to_keep=3)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    timeout_flag = False
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(long_path=False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)

        time.sleep(1.0)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'

        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], pose[1]-local_next_goal[1]]))

        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        ref_action = [0., 0.]
        goal = [0., 0.]
        rnn_h_in = [np.zeros([1, flags.n_hidden]),
                    np.zeros([1, flags.n_hidden])] if flags.rnn_type == 'lstm' else np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_q = []
        epi_err_h = []
        epi_err_c = []
        loop_time = []
        action = [0., 0.]
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route))
            total_reward += reward

            if t > 0 and not (flags.supervision and (episode+1)%10 == 0):
                agent.Add2Mem([depth_stack, 
                               [combined_cmd], 
                               prev_a, 
                               local_next_goal, 
                               [rnn_h_in[0][0], rnn_h_in[1][0]] if flags.rnn_type == 'lstm' else rnn_h_in[0], 
                               action, 
                               reward, 
                               terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, rnn_h_out = agent.ActorPredict([depth_stack], [[combined_cmd]], [prev_a], rnn_h_in)
            action += (exploration_noise.noise() * np.asarray(agent.action_range))

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])
            
            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network'), global_step=episode)

            if len(agent.memory) > agent.batch_size and not flags.test:
                q, err_h, err_c = agent.Train()
                epi_q.append(np.amax(q))
                epi_err_h.append(err_h)
                epi_err_c.append(err_c)
            
            if result >= 1:
                if len(agent.memory) > agent.batch_size and not flags.test:
                    summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                          q_ph: np.amax(q),
                                                          err_h_ph: np.mean(epi_err_h),
                                                          err_c_ph: np.mean(epi_err_c)})
                    summary_writer.add_summary(summary, T)
                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time))
                print info_train

                episode += 1
                T += 1
                break

            t += 1
            T += 1
            rnn_h_in = rnn_h_out
            rate.sleep()
            loop_time.append(time.time() - start_time)
Example #3
def main(robot_name, rviz):
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    # env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                mu=flags.mu,
                                theta=flags.theta,
                                sigma=flags.sigma)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        agent = RDPG(flags, sess)

        trainable_var = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())

        model_dir = os.path.join(flags.model_dir, flags.model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        init_dir = os.path.join(flags.model_dir, flags.init_model_name)

        # summary
        if not flags.testing:
            print "  [*] printing trainable variables"
            for idx, v in enumerate(trainable_var):
                print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()),
                                                      v.name)
                # with tf.name_scope(v.name.replace(':0', '')):
                #     variable_summaries(v)

            reward_ph = tf.placeholder(tf.float32, [], name='reward')
            q_ph = tf.placeholder(tf.float32, [], name='q_pred')
            noise_ph = tf.placeholder(tf.float32, [], name='noise')
            epsilon_ph = tf.placeholder(tf.float32, [], name='epsilon')
            err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
            err_c_ph = tf.placeholder(tf.float32, [], name='err_c')

            tf.summary.scalar('reward', reward_ph)
            tf.summary.scalar('q_estimate', q_ph)
            tf.summary.scalar('noise', noise_ph)
            tf.summary.scalar('epsilon', epsilon_ph)
            tf.summary.scalar('err_h', err_h_ph)
            tf.summary.scalar('err_c', err_c_ph)
            merged = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

        part_var = []
        for idx, v in enumerate(trainable_var):
            if 'actor/online/encoder' in v.name or 'actor/online/controller' in v.name:
                part_var.append(v)
        saver = tf.train.Saver(trainable_var, max_to_keep=5)
        part_saver = tf.train.Saver(part_var, max_to_keep=5)

        # load model
        if 'empty' in flags.init_model_name:
            checkpoint = tf.train.get_checkpoint_state(model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')
        else:
            checkpoint = tf.train.get_checkpoint_state(init_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                part_saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')

        episode = 0
        T = 0
        epsilon = flags.init_epsilon
        noise = flags.init_noise
        # start training

        room_position = np.array([-1, -1])
        while T < flags.total_steps and not rospy.is_shutdown():
            print ''
            # set robot in a room
            init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom()
            env.SetObjectPose(robot_name,
                              [init_pose[0], init_pose[1], 0., init_pose[2]])
            time.sleep(1.)
            # generate a path to other room
            target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
                room_position)
            # get a long path
            map_plan, real_route = world.GetPath(init_map_pose +
                                                 target_map_pose)

            dynamic_route = copy.deepcopy(real_route)
            env.LongPathPublish(real_route)
            time.sleep(1.)

            if len(map_plan) == 0:
                print 'no path'
                continue

            pose = env.GetSelfStateGT()
            target_table_point_list, cmd_list, check_point_list = world.GetCommandPlan(
                pose, real_route)

            # print 'target_table_point_list:', target_table_point_list
            print 'cmd_list:', cmd_list

            if len(target_table_point_list) == 0:
                print 'no check point'
                continue

            table_goal = target_table_point_list[0]
            goal = world.Table2RealPosition(table_goal)
            env.target_point = goal
            env.distance = np.sqrt(
                np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

            total_reward = 0
            laser = env.GetLaserObservation()
            laser_stack = np.stack([laser, laser, laser], axis=-1)
            action = [0., 0.]
            epi_q = []
            loop_time = []
            t = 0
            terminal = False
            result = 0
            cmd_idx = 0
            status = [0]
            prev_action = [0., 0.]
            err_h_epi, err_c_epi = 0., 0.
            status_cnt = 5
            if np.random.rand() <= epsilon:
                true_flag = True
                print '-------using true actions------'
            else:
                true_flag = False

            prev_state = (np.zeros((1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            prev_state_2 = (np.zeros((1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            while not rospy.is_shutdown():
                start_time = time.time()
                if t == 0:
                    print 'current cmd:', cmd_list[cmd_idx]

                terminal, result, reward = env.GetRewardAndTerminate(t)
                total_reward += reward

                if t > 0:
                    status = [1] if result == 1 else [0]
                    agent.Add2Mem(
                        (laser_stack, cmd, cmd_next, cmd_skip, prev_action,
                         local_goal, prev_state, prev_state_2, action, reward,
                         terminal, status, true_action))
                    prev_state = copy.deepcopy(state)
                    prev_state_2 = copy.deepcopy(state_2)
                    prev_action = copy.deepcopy(action)
                if result > 1:
                    break
                elif result == 1:
                    if cmd_list[cmd_idx] == 5:
                        print 'Finish!!!!!!!'
                        break
                    cmd_idx += 1
                    t = 0
                    status_cnt = 0
                    table_goal = target_table_point_list[cmd_idx]
                    goal = world.Table2RealPosition(table_goal)
                    env.target_point = goal
                    env.distance = np.sqrt(
                        np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

                    continue

                local_goal = env.GetLocalPoint(goal)
                env.PathPublish(local_goal)

                cmd = [cmd_list[cmd_idx]]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [cmd_list[cmd_idx + 1]]

                if status_cnt < 5:
                    cmd = [0]
                    cmd_skip = [0]
                else:
                    cmd_next = [0]
                status_cnt += 1

                # print '{}, {}, {}'.format(cmd[0], cmd_next[0], cmd_skip[0])

                laser = env.GetLaserObservation()
                laser_stack = np.stack(
                    [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

                if not flags.testing:
                    if noise > 0:
                        noise -= flags.init_noise / flags.noise_stop_episode
                        epsilon -= flags.init_epsilon / flags.noise_stop_episode

                time1 = time.time() - start_time

                # predict action
                if t > 0:
                    prev_action = action
                if cmd_list[cmd_idx] == 5:
                    local_goal_a = local_goal
                else:
                    local_goal_a = [0., 0.]
                # input_laser, input_cmd, input_cmd_next, prev_status, prev_action, input_goal, prev_state
                action, state, state_2 = agent.ActorPredict(
                    [laser_stack], [cmd], [cmd_next], [cmd_skip], [action],
                    [local_goal_a], prev_state, prev_state_2)

                if not flags.testing:
                    if T < flags.noise_stop_episode:
                        action += (exploration_noise.noise() *
                                   np.asarray(agent.action_range) * noise)

                # get true action
                pose = env.GetSelfStateGT()
                near_goal, dynamic_route = world.GetNextNearGoal(
                    dynamic_route, pose)

                local_near_goal = env.GetLocalPoint(near_goal)
                true_action = env.Controller(local_near_goal, None, 1)

                if true_flag:
                    action = true_action

                env.SelfControl(action, agent.action_range)

                time2 = time.time() - start_time - time1

                if T > agent.batch_size and not flags.testing:
                    q, err_h, err_c = agent.Train()
                    epi_q.append(np.amax(q))
                    err_h_epi += err_h
                    err_c_epi += err_c

                if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.testing:
                    saver.save(sess,
                               os.path.join(model_dir, 'network'),
                               global_step=T + 1)

                t += 1
                T += 1
                loop_time.append(time.time() - start_time)
                time3 = time.time() - start_time - time1 - time2

                used_time = time.time() - start_time

                if used_time > 0.04:
                    print '{:.4f} | {:.4f} | {:.4f} | {:.4f}'.format(
                        time1, time2, time3, used_time)
                rate.sleep()

            if T > agent.batch_size and not flags.testing and t > 0:
                if not true_flag:
                    summary = sess.run(merged,
                                       feed_dict={
                                           reward_ph: total_reward,
                                           q_ph: np.amax(q),
                                           noise_ph: noise,
                                           epsilon_ph: epsilon,
                                           err_h_ph: err_h_epi / t,
                                           err_c_ph: err_c_epi / t
                                       })
                    summary_writer.add_summary(summary, T)
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Q:{:.2f} | Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.amax(q),
                    np.mean(loop_time))

            else:
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} |  Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.mean(loop_time))
            episode += 1
Example #4
def testing(sess, model):
    # --------------------load model-------------------------
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    init_dir = os.path.join(flags.model_dir, flags.init_model_name)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    all_var = tf.global_variables()
    load_var = all_var[:21]
    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(load_var):
        print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
        if 'encoder' in v.name or 'controller' in v.name:
            part_var.append(v)
    saver = tf.train.Saver(trainable_var, max_to_keep=5)
    part_saver = tf.train.Saver(load_var, max_to_keep=1)

    # load model
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        part_saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Network model loaded: ", checkpoint.model_checkpoint_path)
    else:
        print('No model is found')

    # ----------------------init env-------------------------
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    robot_name = 'robot1'
    rviz = False
    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    init_pose = world.RandomInitPose()
    env.target_point = init_pose

    episode = 0
    T = 0

    # ------------------start to test------------------------
    room_position = np.array([-1, -1])
    SR_cnt = 0
    TIME_list = []
    POSE_list = []
    while not rospy.is_shutdown():
        print ''
        time.sleep(1.)
        # set robot in a room
        init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(
            room_position)
        env.SetObjectPose(robot_name,
                          [init_pose[0], init_pose[1], 0., init_pose[2]])
        # generate a path to other room
        target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
            room_position)
        # get a long path
        map_plan, real_route = world.GetPath(init_map_pose + target_map_pose)

        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)

        if len(map_plan) == 0:
            print 'no path'
            continue

        pose = env.GetSelfStateGT()
        target_table_point_list, cmd_list, checkpoint_list = world.GetCommandPlan(
            pose, real_route)
        cmd_list += [0]
        print 'target_table_point_list:', target_table_point_list
        print 'cmd_list:', cmd_list

        if len(target_table_point_list) == 0:
            print 'no check point'
            continue

        table_goal = target_table_point_list[0]
        goal = world.Table2RealPosition(table_goal)
        env.target_point = goal
        env.distance = np.sqrt(
            np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        laser = env.GetLaserObservation()
        laser_stack = np.stack([laser, laser, laser], axis=-1)
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False
        result = 0
        cmd_idx = 0
        prob = 0.
        used_action = [0., 0.]
        true_action = [0., 0.]
        logits = [0., 0.]
        cnt = 5
        status = 0
        prev_status = 0
        exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                    mu=flags.mu,
                                    theta=flags.theta,
                                    sigma=flags.sigma)
        epi_start_time = time.time()
        positions = []
        CMD_list = []
        while not rospy.is_shutdown():
            start_time = time.time()
            if t == 0:
                print 'current cmd:', cmd_list[cmd_idx]
            prev_result = result
            terminal, result, reward = env.GetRewardAndTerminate(t)
            total_reward += reward
            # if t > 0:
            #     print  '{}, {}, | {:.3f}, {:.3f} | {}, {}, {}'.format(result, status, logits[0], logits[1], cmd[0], cmd_next[0], cmd_skip[0])
            true_status = 0
            if result > 1:
                break
            # elif result == 1:
            #     true_status = 1
            #     if cmd_list[cmd_idx] == 5:
            #         print 'Finish!!!!!!!'
            #         break
            #     cmd_idx += 1
            #     t = 0
            #     table_goal = target_table_point_list[cmd_idx]
            #     goal = world.Table2RealPosition(table_goal)
            #     env.target_point = goal
            #     env.distance = np.sqrt(np.linalg.norm([pose[0]-goal[0], pose[1]-goal[1]]))
            #     continue

            local_goal = env.GetLocalPoint(goal)
            env.PathPublish(local_goal)

            if cnt < 5:
                cnt += 1
                cmd = [0]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [0]
            else:
                cmd = [cmd_list[cmd_idx]]
                cmd_next = [0]
                cmd_skip = [cmd_list[cmd_idx + 1]]
            CMD_list.append(cmd[0])
            curr_status = [cmd[0] * flags.n_cmd_type + cmd_next[0]]
            next_status = [cmd_next[0] * flags.n_cmd_type + cmd_skip[0]]
            prev_laser = laser
            laser = env.GetLaserObservation()
            if np.random.rand():
                pass
            laser_stack = np.stack(
                [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

            # predict action
            if cmd_list[cmd_idx] == 5:
                local_goal_a = local_goal
            else:
                local_goal_a = [0., 0.]

            prev_status = copy.deepcopy(status)
            status, action, logits, _ = model.Predict(
                [laser_stack], [curr_status], [next_status], [local_goal_a], t,
                [[used_action[0], used_action[1]]])
            action += (exploration_noise.noise() *
                       np.asarray(model.action_range) * 0.1)
            if prev_status == 0 and status == 1 and cnt >= 5:
                if cmd_list[cmd_idx] == 5:
                    print 'Finish!!!!!!!'
                    break
                cmd_idx += 1
                t = 0
                cnt = 0
                table_goal = target_table_point_list[cmd_idx]
                goal = world.Table2RealPosition(table_goal)
                env.target_point = goal
                env.distance = np.sqrt(
                    np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))
                continue

            # get action
            pose = env.GetSelfStateGT()
            near_goal, dynamic_route = world.GetNextNearGoal(
                dynamic_route, pose)

            local_near_goal = env.GetLocalPoint(near_goal)
            true_action = env.Controller(local_near_goal, None, 1)
            positions.append(pose[:2])
            # print action - np.asarray(true_action), action
            if cmd_list[cmd_idx] == 0:
                used_action = true_action
            else:
                used_action = action

            env.SelfControl(used_action,
                            [flags.a_linear_range, flags.a_angular_range])

            t += 1
            T += 1
            loop_time.append(time.time() - start_time)
            rate.sleep()

        print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:}'.format(
            episode, t, total_reward, T)
        episode += 1
        if cmd[0] == 5:
            SR_cnt += 1.0
            TIME_list.append(time.time() - epi_start_time)
            # LogData(positions, CMD_list, int(SR_cnt), os.path.join(CWD, 'experiments/positions'))
            print 'SR: {:.3f} | AVG TIME: {:.3f}, | MAX TIME: {:.3f}'.format(
                SR_cnt / episode, np.mean(TIME_list), np.amax(TIME_list))
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG_BPTT(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    if not flags.test:
        saver = tf.train.Saver(max_to_keep=5, save_relative_paths=True)
    else:
        saver = tf.train.Saver(trainable_var)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    timeout_flag = False
    noise_annealing = 1.
    success_nums = np.zeros([10], dtype=np.float32)
    demo_lens = np.zeros([10], dtype=np.float32)
    results_nums = np.zeros([3], dtype=np.float32)
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            env.SetObjectPose(robot_name, [-1., -1., 0., 0.], once=True)
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)
        
        time.sleep(0.1)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq, cmd_list = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'
        
        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], pose[1]-local_next_goal[1]]))
        
        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        action = [0., 0.]
        goal = [0., 0.]
        gru_h_in = np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_err_h = []
        loop_time = []
        data_seq = []
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route),
                                                                  test=flags.test)
            total_reward += reward

            if t > 0:
                data_seq.append([depth_stack, [cmd], prev_a, action, reward, terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            # add noise
            if flags.test:
                if cmd == 0:
                    cmd = 2
            elif cmd != 2 and np.random.rand() < 0.1:
                cmd = np.random.randint(4)

            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, gru_h_out = agent.ActorPredict([depth_stack], [[cmd]], [prev_a], gru_h_in)
            if flags.test:
                action += (exploration_noise.noise() * np.asarray(agent.action_range)) * noise_annealing

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])

            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network'), global_step=episode)

            training_step_time = 0.
            if result >= 1:
                if not (flags.supervision and (episode+1)%10 == 0):
                    agent.Add2Mem(data_seq)

                if len(agent.memory) > agent.batch_size and not flags.test:
                    training_step_start_time = time.time()
                    q = agent.Train()
                    training_step_time = time.time() - training_step_start_time
                    if t > 1:
                        summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                              q_ph: np.amax(q)
                                                              })
                        summary_writer.add_summary(summary, T)

                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time)) + \
                             '| OpStepT(s): {:.3f}'.format(training_step_time)
                print info_train
                episode += 1
                demo_cnt = min(len(cmd_list), 10)
                demo_lens[demo_cnt-1] += 1
                if result == 2:
                    success_nums[demo_cnt-1] += 1
                    results_nums[0] += 1
                elif result in [3, 4]:
                    results_nums[2] += 1
                elif result == 1:
                    results_nums[1] += 1
                if flags.test and episode == 1000:
                    print 'success num distribution: ', success_nums
                    print 'demo length distribution: ', demo_lens
                    demo_lens[demo_lens==0] = 1e-12
                    print 'success rate distribution: ', success_nums/demo_lens
                    print 'results nums: ', results_nums
                    return True
                break

            t += 1
            T += 1
            noise_annealing -= 1/flags.max_training_step
            gru_h_in = gru_h_out
            loop_time.append(time.time() - start_time)
            rate.sleep()
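
Neither main() variant above creates its own TensorFlow session; as Example #3 shows, the session is expected to be built by the caller and passed in. A hypothetical entry point mirroring that setup might look like the following sketch (flag parsing and any GPU selection that the real scripts perform are omitted):

if __name__ == '__main__':
    # mirror the session configuration used in Example #3
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        main(sess, robot_name='robot1')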