Example #1
class RandomAlgorithm(TensorflowAlgorithm):
    def __init__(self, session, scope, obs_dim, act_dim):
        super(self.__class__, self).__init__(session, scope, obs_dim, act_dim)
        self.noise = OUNoise(act_dim,
                             mu=0,
                             sigma=Context.config['alg.noise_sigma'],
                             theta=Context.config['alg.noise_theta'])

    def predict(self, s):
        return self.noise.noise()

    def train(self, episodes, steps, **callbacks):
        raise NotImplementedError

    def _save_weights(self, path):
        pass

    def _restore_weights(self, path):
        pass
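
All of the examples on this page construct an OUNoise object with mu, theta, and sigma parameters and call its noise() method to perturb predicted actions for exploration. For reference, below is a minimal sketch of an Ornstein-Uhlenbeck noise class matching that interface; it is an illustrative assumption (including the default theta and sigma values), and the class actually used by these projects may differ in details such as reset handling or time-step scaling.

import numpy as np

class OUNoise(object):
    """Ornstein-Uhlenbeck process producing temporally correlated exploration noise."""

    def __init__(self, action_dimension, mu=0., theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1), one value per action dimension
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state
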
Example #2
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
        # with tf.name_scope(v.name.replace(':0', '')):
        #     variable_summaries(v)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
        err_c_ph = tf.placeholder(tf.float32, [], name='err_c')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        tf.summary.scalar('err_h', err_h_ph)
        tf.summary.scalar('err_c', err_c_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    saver = tf.train.Saver(trainable_var, max_to_keep=3)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    timeout_flag = False
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(long_path=False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)

        time.sleep(1.0)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'

        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], pose[1]-local_next_goal[1]]))

        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        ref_action = [0., 0.]
        goal = [0., 0.]
        rnn_h_in = [np.zeros([1, flags.n_hidden]),
                    np.zeros([1, flags.n_hidden])] if flags.rnn_type == 'lstm' else np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_q = []
        epi_err_h = []
        epi_err_c = []
        loop_time = []
        action = [0., 0.]
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route))
            total_reward += reward

            if t > 0 and not (flags.supervision and (episode+1)%10 == 0):
                agent.Add2Mem([depth_stack, 
                               [combined_cmd], 
                               prev_a, 
                               local_next_goal, 
                               [rnn_h_in[0][0], rnn_h_in[1][0]] if flags.rnn_type == 'lstm' else rnn_h_in[0], 
                               action, 
                               reward, 
                               terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, rnn_h_out = agent.ActorPredict([depth_stack], [[combined_cmd]], [prev_a], rnn_h_in)
            action += (exploration_noise.noise() * np.asarray(agent.action_range))

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])
            
            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network'), global_step=episode)

            if len(agent.memory) > agent.batch_size and not flags.test:
                q, err_h, err_c = agent.Train()
                epi_q.append(np.amax(q))
                epi_err_h.append(err_h)
                epi_err_c.append(err_c)
            
            if result >= 1:
                if len(agent.memory) > agent.batch_size and not flags.test:
                    summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                          q_ph: np.amax(q),
                                                          err_h_ph: np.mean(epi_err_h),
                                                          err_c_ph: np.mean(epi_err_c)})
                    summary_writer.add_summary(summary, T)
                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time))
                print info_train

                episode += 1
                T += 1
                break

            t += 1
            T += 1
            rnn_h_in = rnn_h_out
            rate.sleep()
            loop_time.append(time.time() - start_time)
Example #3
def main(robot_name, rviz):
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    # env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                mu=flags.mu,
                                theta=flags.theta,
                                sigma=flags.sigma)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        agent = RDPG(flags, sess)

        trainable_var = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())

        model_dir = os.path.join(flags.model_dir, flags.model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        init_dir = os.path.join(flags.model_dir, flags.init_model_name)

        # summary
        if not flags.testing:
            print "  [*] printing trainable variables"
            for idx, v in enumerate(trainable_var):
                print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()),
                                                      v.name)
                # with tf.name_scope(v.name.replace(':0', '')):
                #     variable_summaries(v)

            reward_ph = tf.placeholder(tf.float32, [], name='reward')
            q_ph = tf.placeholder(tf.float32, [], name='q_pred')
            noise_ph = tf.placeholder(tf.float32, [], name='noise')
            epsilon_ph = tf.placeholder(tf.float32, [], name='epsilon')
            err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
            err_c_ph = tf.placeholder(tf.float32, [], name='err_c')

            tf.summary.scalar('reward', reward_ph)
            tf.summary.scalar('q_estimate', q_ph)
            tf.summary.scalar('noise', noise_ph)
            tf.summary.scalar('epsilon', epsilon_ph)
            tf.summary.scalar('err_h', err_h_ph)
            tf.summary.scalar('err_c', err_c_ph)
            merged = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

        part_var = []
        for idx, v in enumerate(trainable_var):
            if 'actor/online/encoder' in v.name or 'actor/online/controller' in v.name:
                part_var.append(v)
        saver = tf.train.Saver(trainable_var, max_to_keep=5)
        part_saver = tf.train.Saver(part_var, max_to_keep=5)

        # load model
        if 'empty' in flags.init_model_name:
            checkpoint = tf.train.get_checkpoint_state(model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')
        else:
            checkpoint = tf.train.get_checkpoint_state(init_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                part_saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ",
                      checkpoint.model_checkpoint_path)
            else:
                print('No model is found')

        episode = 0
        T = 0
        epsilon = flags.init_epsilon
        noise = flags.init_noise
        # start training

        room_position = np.array([-1, -1])
        while T < flags.total_steps and not rospy.is_shutdown():
            print ''
            # set robot in a room
            init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom()
            env.SetObjectPose(robot_name,
                              [init_pose[0], init_pose[1], 0., init_pose[2]])
            time.sleep(1.)
            # generate a path to other room
            target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
                room_position)
            # get a long path
            map_plan, real_route = world.GetPath(init_map_pose +
                                                 target_map_pose)

            dynamic_route = copy.deepcopy(real_route)
            env.LongPathPublish(real_route)
            time.sleep(1.)

            if len(map_plan) == 0:
                print 'no path'
                continue

            pose = env.GetSelfStateGT()
            target_table_point_list, cmd_list, check_point_list = world.GetCommandPlan(
                pose, real_route)

            # print 'target_table_point_list:', target_table_point_list
            print 'cmd_list:', cmd_list

            if len(target_table_point_list) == 0:
                print 'no check point'
                continue

            table_goal = target_table_point_list[0]
            goal = world.Table2RealPosition(table_goal)
            env.target_point = goal
            env.distance = np.sqrt(
                np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

            total_reward = 0
            laser = env.GetLaserObservation()
            laser_stack = np.stack([laser, laser, laser], axis=-1)
            action = [0., 0.]
            epi_q = []
            loop_time = []
            t = 0
            terminal = False
            result = 0
            cmd_idx = 0
            status = [0]
            prev_action = [0., 0.]
            err_h_epi, err_c_epi = 0., 0.
            status_cnt = 5
            if np.random.rand() <= epsilon:
                true_flag = True
                print '-------using true actions------'
            else:
                true_flag = False

            prev_state = (np.zeros((1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            prev_state_2 = (np.zeros((1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            while not rospy.is_shutdown():
                start_time = time.time()
                if t == 0:
                    print 'current cmd:', cmd_list[cmd_idx]

                terminal, result, reward = env.GetRewardAndTerminate(t)
                total_reward += reward

                if t > 0:
                    status = [1] if result == 1 else [0]
                    agent.Add2Mem(
                        (laser_stack, cmd, cmd_next, cmd_skip, prev_action,
                         local_goal, prev_state, prev_state_2, action, reward,
                         terminal, status, true_action))
                    prev_state = copy.deepcopy(state)
                    prev_state_2 = copy.deepcopy(state_2)
                    prev_action = copy.deepcopy(action)
                if result > 1:
                    break
                elif result == 1:
                    if cmd_list[cmd_idx] == 5:
                        print 'Finish!!!!!!!'
                        break
                    cmd_idx += 1
                    t = 0
                    status_cnt = 0
                    table_goal = target_table_point_list[cmd_idx]
                    goal = world.Table2RealPosition(table_goal)
                    env.target_point = goal
                    env.distance = np.sqrt(
                        np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

                    continue

                local_goal = env.GetLocalPoint(goal)
                env.PathPublish(local_goal)

                cmd = [cmd_list[cmd_idx]]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [cmd_list[cmd_idx + 1]]

                if status_cnt < 5:
                    cmd = [0]
                    cmd_skip = [0]
                else:
                    cmd_next = [0]
                status_cnt += 1

                # print '{}, {}, {}'.format(cmd[0], cmd_next[0], cmd_skip[0])

                laser = env.GetLaserObservation()
                laser_stack = np.stack(
                    [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

                if not flags.testing:
                    if noise > 0:
                        noise -= flags.init_noise / flags.noise_stop_episode
                        epsilon -= flags.init_epsilon / flags.noise_stop_episode

                time1 = time.time() - start_time

                # predict action
                if t > 0:
                    prev_action = action
                if cmd_list[cmd_idx] == 5:
                    local_goal_a = local_goal
                else:
                    local_goal_a = [0., 0.]
                # input_laser, input_cmd, input_cmd_next, prev_status, prev_action, input_goal, prev_state
                action, state, state_2 = agent.ActorPredict(
                    [laser_stack], [cmd], [cmd_next], [cmd_skip], [action],
                    [local_goal_a], prev_state, prev_state_2)

                if not flags.testing:
                    if T < flags.noise_stop_episode:
                        action += (exploration_noise.noise() *
                                   np.asarray(agent.action_range) * noise)

                # get true action
                pose = env.GetSelfStateGT()
                near_goal, dynamic_route = world.GetNextNearGoal(
                    dynamic_route, pose)

                local_near_goal = env.GetLocalPoint(near_goal)
                true_action = env.Controller(local_near_goal, None, 1)

                if true_flag:
                    action = true_action

                env.SelfControl(action, agent.action_range)

                time2 = time.time() - start_time - time1

                if T > agent.batch_size and not flags.testing:
                    q, err_h, err_c = agent.Train()
                    epi_q.append(np.amax(q))
                    err_h_epi += err_h
                    err_c_epi += err_c

                if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.testing:
                    saver.save(sess,
                               os.path.join(model_dir, 'network'),
                               global_step=T + 1)

                t += 1
                T += 1
                loop_time.append(time.time() - start_time)
                time3 = time.time() - start_time - time1 - time2

                used_time = time.time() - start_time

                if used_time > 0.04:
                    print '{:.4f} | {:.4f} | {:.4f} | {:.4f}'.format(
                        time1, time2, time3, used_time)
                rate.sleep()

            if T > agent.batch_size and not flags.testing and t > 0:
                if not true_flag:
                    summary = sess.run(merged,
                                       feed_dict={
                                           reward_ph: total_reward,
                                           q_ph: np.amax(q),
                                           noise_ph: noise,
                                           epsilon_ph: epsilon,
                                           err_h_ph: err_h_epi / t,
                                           err_c_ph: err_c_epi / t
                                       })
                    summary_writer.add_summary(summary, T)
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Q:{:.2f} | Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.amax(q),
                    np.mean(loop_time))

            else:
                print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} |  Loop Time:{:.3f} '.format(
                    episode, t, total_reward, T, np.mean(loop_time))
            episode += 1
Example #4
def testing(sess, model):
    # --------------------load model-------------------------
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    init_dir = os.path.join(flags.model_dir, flags.init_model_name)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)
    all_var = tf.global_variables()
    load_var = all_var[:21]
    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(load_var):
        print("  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name))
        if 'encoder' in v.name or 'controller' in v.name:
            part_var.append(v)
    saver = tf.train.Saver(trainable_var, max_to_keep=5)
    part_saver = tf.train.Saver(load_var, max_to_keep=1)

    # load model
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        part_saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Network model loaded: ", checkpoint.model_checkpoint_path)
    else:
        print('No model is found')

    # ----------------------init env-------------------------
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)

    FileProcess()

    robot_name = 'robot1'
    rviz = False
    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)

    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')

    env.GetModelStates()
    env.ResetWorld()
    env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)

    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    init_pose = world.RandomInitPose()
    env.target_point = init_pose

    episode = 0
    T = 0

    # ------------------start to test------------------------
    room_position = np.array([-1, -1])
    SR_cnt = 0
    TIME_list = []
    POSE_list = []
    while not rospy.is_shutdown():
        print ''
        time.sleep(1.)
        # set robot in a room
        init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(
            room_position)
        env.SetObjectPose(robot_name,
                          [init_pose[0], init_pose[1], 0., init_pose[2]])
        # generate a path to other room
        target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(
            room_position)
        # get a long path
        map_plan, real_route = world.GetPath(init_map_pose + target_map_pose)

        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)

        if len(map_plan) == 0:
            print 'no path'
            continue

        pose = env.GetSelfStateGT()
        target_table_point_list, cmd_list, checkpoint_list = world.GetCommandPlan(
            pose, real_route)
        cmd_list += [0]
        print 'target_table_point_list:', target_table_point_list
        print 'cmd_list:', cmd_list

        if len(target_table_point_list) == 0:
            print 'no check point'
            continue

        table_goal = target_table_point_list[0]
        goal = world.Table2RealPosition(table_goal)
        env.target_point = goal
        env.distance = np.sqrt(
            np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        laser = env.GetLaserObservation()
        laser_stack = np.stack([laser, laser, laser], axis=-1)
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False
        result = 0
        cmd_idx = 0
        prob = 0.
        used_action = [0., 0.]
        true_action = [0., 0.]
        logits = [0., 0.]
        cnt = 5
        status = 0
        prev_status = 0
        exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                    mu=flags.mu,
                                    theta=flags.theta,
                                    sigma=flags.sigma)
        epi_start_time = time.time()
        positions = []
        CMD_list = []
        while not rospy.is_shutdown():
            start_time = time.time()
            if t == 0:
                print 'current cmd:', cmd_list[cmd_idx]
            prev_result = result
            terminal, result, reward = env.GetRewardAndTerminate(t)
            total_reward += reward
            # if t > 0:
            #     print  '{}, {}, | {:.3f}, {:.3f} | {}, {}, {}'.format(result, status, logits[0], logits[1], cmd[0], cmd_next[0], cmd_skip[0])
            true_status = 0
            if result > 1:
                break
            # elif result == 1:
            #     true_status = 1
            #     if cmd_list[cmd_idx] == 5:
            #         print 'Finish!!!!!!!'
            #         break
            #     cmd_idx += 1
            #     t = 0
            #     table_goal = target_table_point_list[cmd_idx]
            #     goal = world.Table2RealPosition(table_goal)
            #     env.target_point = goal
            #     env.distance = np.sqrt(np.linalg.norm([pose[0]-goal[0], pose[1]-goal[1]]))
            #     continue

            local_goal = env.GetLocalPoint(goal)
            env.PathPublish(local_goal)

            if cnt < 5:
                cnt += 1
                cmd = [0]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [0]
            else:
                cmd = [cmd_list[cmd_idx]]
                cmd_next = [0]
                cmd_skip = [cmd_list[cmd_idx + 1]]
            CMD_list.append(cmd[0])
            curr_status = [cmd[0] * flags.n_cmd_type + cmd_next[0]]
            next_status = [cmd_next[0] * flags.n_cmd_type + cmd_skip[0]]
            prev_laser = laser
            laser = env.GetLaserObservation()
            if np.random.rand():
                pass
            laser_stack = np.stack(
                [laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

            # predict action
            if cmd_list[cmd_idx] == 5:
                local_goal_a = local_goal
            else:
                local_goal_a = [0., 0.]

            prev_status = copy.deepcopy(status)
            status, action, logits, _ = model.Predict(
                [laser_stack], [curr_status], [next_status], [local_goal_a], t,
                [[used_action[0], used_action[1]]])
            action += (exploration_noise.noise() *
                       np.asarray(model.action_range) * 0.1)
            if prev_status == 0 and status == 1 and cnt >= 5:
                if cmd_list[cmd_idx] == 5:
                    print 'Finish!!!!!!!'
                    break
                cmd_idx += 1
                t = 0
                cnt = 0
                table_goal = target_table_point_list[cmd_idx]
                goal = world.Table2RealPosition(table_goal)
                env.target_point = goal
                env.distance = np.sqrt(
                    np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))
                continue

            # get action
            pose = env.GetSelfStateGT()
            near_goal, dynamic_route = world.GetNextNearGoal(
                dynamic_route, pose)

            local_near_goal = env.GetLocalPoint(near_goal)
            true_action = env.Controller(local_near_goal, None, 1)
            positions.append(pose[:2])
            # print action - np.asarray(true_action), action
            if cmd_list[cmd_idx] == 0:
                used_action = true_action
            else:
                used_action = action

            env.SelfControl(used_action,
                            [flags.a_linear_range, flags.a_angular_range])

            t += 1
            T += 1
            loop_time.append(time.time() - start_time)
            rate.sleep()

        print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:}'.format(
            episode, t, total_reward, T)
        episode += 1
        if cmd[0] == 5:
            SR_cnt += 1.0
            TIME_list.append(time.time() - epi_start_time)
            # LogData(positions, CMD_list, int(SR_cnt), os.path.join(CWD, 'experiments/positions'))
            print 'SR: {:.3f} | AVG TIME: {:.3f}, | MAX TIME: {:.3f}'.format(
                SR_cnt / episode, np.mean(TIME_list), np.amax(TIME_list))
def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name, rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h], 
                                depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1-world.map)*255)

    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action, 
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG_BPTT(flags, sess)

    trainable_var = tf.trainable_variables()

    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir): 
        os.makedirs(model_dir)
    # summary
    print "  [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print "  var {:3}: {:15}   {}".format(idx, str(v.get_shape()), v.name)
    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    if not flags.test:
        saver = tf.train.Saver(max_to_keep=5, save_relative_paths=True)
    else:
        saver = tf.train.Saver(trainable_var)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path 
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0

    # start learning
    training_start_time = time.time()
    timeout_flag = False
    noise_annealing = 1.
    success_nums = np.zeros([10], dtype=np.float32)
    demo_lens = np.zeros([10], dtype=np.float32)
    results_nums = np.zeros([3], dtype=np.float32)
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            env.SetObjectPose(robot_name, [-1., -1., 0., 0.], once=True)
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)
        
        time.sleep(0.1)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq, cmd_list = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq, pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'
        
        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        env.distance = np.sqrt(np.linalg.norm([pose[0]-local_next_goal[0], pose[1]-local_next_goal[1]]))
        
        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        action = [0., 0.]
        goal = [0., 0.]
        gru_h_in = np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_err_h = []
        loop_time = []
        data_seq = []
        t = 0
        terminate = False
        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(t, 
                                                                  max_step=flags.max_epi_step, 
                                                                  len_route=len(dynamic_route),
                                                                  test=flags.test)
            total_reward += reward

            if t > 0:
                data_seq.append([depth_stack, [cmd], prev_a, action, reward, terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass
            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, 
                                                           cmd_seq, 
                                                           goal_seq, 
                                                           pose, 
                                                           prev_cmd,
                                                           prev_last_cmd, 
                                                           prev_goal)
            # add noise
            if flags.test:
                if cmd == 0:
                    cmd = 2
            elif cmd != 2 and np.random.rand() < 0.1:
                cmd = np.random.randint(4)

            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, gru_h_out = agent.ActorPredict([depth_stack], [[cmd]], [prev_a], gru_h_in)
            if flags.test:
                action += (exploration_noise.noise() * np.asarray(agent.action_range)) * noise_annealing

            if flags.supervision and (episode+1)%10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi/6])

            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network'), global_step=episode)

            training_step_time = 0.
            if result >= 1:
                if not (flags.supervision and (episode+1)%10 == 0):
                    agent.Add2Mem(data_seq)

                if len(agent.memory) > agent.batch_size and not flags.test:
                    training_step_start_time = time.time()
                    q = agent.Train()
                    training_step_time = time.time() - training_step_start_time
                    if t > 1:
                        summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                              q_ph: np.amax(q)
                                                              })
                        summary_writer.add_summary(summary, T)

                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time)/60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time)) + \
                             '| OpStepT(s): {:.3f}'.format(training_step_time)
                print info_train
                episode += 1
                demo_cnt = min(len(cmd_list), 10)
                demo_lens[demo_cnt-1] += 1
                if result == 2:
                    success_nums[demo_cnt-1] += 1
                    results_nums[0] += 1
                elif result in [3, 4]:
                    results_nums[2] += 1
                elif result == 1:
                    results_nums[1] += 1
                if flags.test and episode == 1000:
                    print 'success num distribution: ', success_nums
                    print 'demo length distribution: ', demo_lens
                    demo_lens[demo_lens==0] = 1e-12
                    print 'success rate distribution: ', success_nums/demo_lens
                    print 'results nums: ', results_nums
                    return True
                break

            t += 1
            T += 1
            noise_annealing -= 1/flags.max_training_step
            gru_h_in = gru_h_out
            loop_time.append(time.time() - start_time)
            rate.sleep()
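
Neither main() variant above creates its own TensorFlow session; as Example #3 shows, the session is expected to be built by the caller and passed in. A hypothetical entry point mirroring that setup might look like the following sketch (flag parsing and any GPU selection that the real scripts perform are omitted):

if __name__ == '__main__':
    # mirror the session configuration used in Example #3
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        main(sess, robot_name='robot1')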