def main(sess, robot_name='robot1'):
    # init environment
    env = GazeboWorld(robot_name,
                      rgb_size=[flags.dim_rgb_w, flags.dim_rgb_h],
                      depth_size=[flags.dim_depth_w, flags.dim_depth_h])
    world = GridWorld()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)
    FileProcess()
    print "Env initialized"

    exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
    agent = RDPG(flags, sess)

    trainable_var = tf.trainable_variables()
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # summary
    print " [*] printing trainable variables"
    for idx, v in enumerate(trainable_var):
        print " var {:3}: {:15} {}".format(idx, str(v.get_shape()), v.name)
        # with tf.name_scope(v.name.replace(':0', '')):
        #     variable_summaries(v)

    if not flags.test:
        reward_ph = tf.placeholder(tf.float32, [], name='reward')
        q_ph = tf.placeholder(tf.float32, [], name='q_pred')
        err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
        err_c_ph = tf.placeholder(tf.float32, [], name='err_c')
        tf.summary.scalar('reward', reward_ph)
        tf.summary.scalar('q_estimate', q_ph)
        tf.summary.scalar('err_h', err_h_ph)
        tf.summary.scalar('err_c', err_c_ph)
        merged = tf.summary.merge_all()
        summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # model saver
    saver = tf.train.Saver(trainable_var, max_to_keep=3)
    sess.run(tf.global_variables_initializer())

    if flags.test or flags.load_network:
        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if checkpoint and checkpoint.model_checkpoint_path:
            saver.restore(sess, checkpoint.model_checkpoint_path)
            print 'model loaded: ', checkpoint.model_checkpoint_path
        else:
            print 'model not found'

    rate = rospy.Rate(5.)
    T = 0
    episode = 0
    timeout_flag = False  # initialised here so the first loop iteration can test it

    # start learning
    training_start_time = time.time()
    while not rospy.is_shutdown() and T < flags.max_training_step:
        time.sleep(1.)
        if episode % 40 == 0 or timeout_flag:
            print 'randomising the environment'
            world.RandomTableAndMap()
            world.GetAugMap()
            obj_list = env.GetModelStates()
            obj_pose_dict = world.AllocateObject(obj_list)
            for name in obj_pose_dict:
                env.SetObjectPose(name, obj_pose_dict[name])
            time.sleep(1.)
            print 'randomisation finished'

        try:
            table_route, map_route, real_route, init_pose = world.RandomPath(long_path=False)
            timeout_flag = False
        except:
            timeout_flag = True
            print 'random path timeout'
            continue

        env.SetObjectPose(robot_name,
                          [init_pose[0], init_pose[1], 0., init_pose[2]],
                          once=True)
        time.sleep(1.0)
        dynamic_route = copy.deepcopy(real_route)
        time.sleep(0.1)

        cmd_seq, goal_seq = world.GetCmdAndGoalSeq(table_route)
        pose = env.GetSelfStateGT()
        cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq,
                                                       pose, 2, 2, [0., 0.])
        try:
            local_next_goal = env.Global2Local([next_goal], pose)[0]
        except Exception as e:
            print 'next goal error'

        env.last_target_point = copy.deepcopy(env.target_point)
        env.target_point = next_goal
        # distance to the next goal, following the convention used elsewhere in this file
        env.distance = np.sqrt(np.linalg.norm([pose[0] - next_goal[0],
                                               pose[1] - next_goal[1]]))

        depth_img = env.GetDepthImageObservation()
        depth_stack = np.stack([depth_img, depth_img, depth_img], axis=-1)
        ref_action = [0., 0.]
        goal = [0., 0.]
        rnn_h_in = [np.zeros([1, flags.n_hidden]),
                    np.zeros([1, flags.n_hidden])] if flags.rnn_type == 'lstm' \
                   else np.zeros([1, flags.n_hidden])

        total_reward = 0
        epi_q = []
        epi_err_h = []
        epi_err_c = []
        loop_time = []
        action = [0., 0.]
        t = 0
        terminate = False

        while not rospy.is_shutdown():
            start_time = time.time()

            terminate, result, reward = env.GetRewardAndTerminate(
                t, max_step=flags.max_epi_step, len_route=len(dynamic_route))
            total_reward += reward

            if t > 0 and not (flags.supervision and (episode + 1) % 10 == 0):
                agent.Add2Mem([depth_stack,
                               [combined_cmd],
                               prev_a,
                               local_next_goal,
                               [rnn_h_in[0][0], rnn_h_in[1][0]] if flags.rnn_type == 'lstm' else rnn_h_in[0],
                               action,
                               reward,
                               terminate])

            rgb_image = env.GetRGBImageObservation()
            depth_img = env.GetDepthImageObservation()
            depth_stack = np.stack([depth_img, depth_stack[:, :, 0], depth_stack[:, :, 1]], axis=-1)

            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                env.LongPathPublish(dynamic_route)
            except:
                pass

            prev_cmd = cmd
            prev_last_cmd = last_cmd
            prev_goal = next_goal
            cmd, last_cmd, next_goal = world.GetCmdAndGoal(table_route, cmd_seq, goal_seq,
                                                           pose, prev_cmd, prev_last_cmd, prev_goal)
            combined_cmd = last_cmd * flags.n_cmd_type + cmd
            env.last_target_point = copy.deepcopy(env.target_point)
            env.target_point = next_goal
            local_next_goal = env.Global2Local([next_goal], pose)[0]
            env.PathPublish(local_next_goal)
            env.CommandPublish(cmd)

            prev_a = copy.deepcopy(action)
            action, rnn_h_out = agent.ActorPredict([depth_stack], [[combined_cmd]], [prev_a], rnn_h_in)
            action += (exploration_noise.noise() * np.asarray(agent.action_range))

            if flags.supervision and (episode + 1) % 10 != 0:
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)

            env.SelfControl(action, [0.3, np.pi / 6])

            if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.test:
                saver.save(sess, os.path.join(model_dir, 'network'), global_step=episode)

            if len(agent.memory) > agent.batch_size and not flags.test:
                q, err_h, err_c = agent.Train()
                epi_q.append(np.amax(q))
                epi_err_h.append(err_h)
                epi_err_c.append(err_c)

            if result >= 1:
                if len(agent.memory) > agent.batch_size and not flags.test:
                    summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                          q_ph: np.amax(q),
                                                          err_h_ph: np.mean(epi_err_h),
                                                          err_c_ph: np.mean(epi_err_c)})
                    summary_writer.add_summary(summary, T)
                info_train = '| Episode:{:3d}'.format(episode) + \
                             '| t:{:3d}'.format(t) + \
                             '| T:{:5d}'.format(T) + \
                             '| Reward:{:.3f}'.format(total_reward) + \
                             '| Time(min): {:2.1f}'.format((time.time() - training_start_time) / 60.) + \
                             '| LoopTime(s): {:.3f}'.format(np.mean(loop_time))
                print info_train
                episode += 1
                T += 1
                break

            t += 1
            T += 1
            rnn_h_in = rnn_h_out
            rate.sleep()
            loop_time.append(time.time() - start_time)
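

# ---------------------------------------------------------------------------
# The exploration noise used in the loops above comes from an external OUNoise
# class that is not defined in this file. The sketch below is a minimal,
# assumed implementation of an Ornstein-Uhlenbeck process matching the
# constructor arguments (action_dimension, mu, theta, sigma) and the .noise()
# call used here; the project's actual class may differ in details.
# ---------------------------------------------------------------------------
import numpy as np


class OUNoiseSketch(object):
    """Temporally correlated noise: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, action_dimension, mu=0., theta=0.15, sigma=0.3):
        self.action_dimension = action_dimension
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        # restart the process at the mean (typically called at episode start)
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # one Euler step of the OU process; returns the new noise sample
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state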
def main(robot_name, rviz):
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)

    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)
    FileProcess()

    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)
    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')
    env.GetModelStates()
    env.ResetWorld()
    # env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)
    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                mu=flags.mu, theta=flags.theta, sigma=flags.sigma)

    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        agent = RDPG(flags, sess)
        trainable_var = tf.trainable_variables()
        sess.run(tf.global_variables_initializer())

        model_dir = os.path.join(flags.model_dir, flags.model_name)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        init_dir = os.path.join(flags.model_dir, flags.init_model_name)

        # summary
        if not flags.testing:
            print " [*] printing trainable variables"
            for idx, v in enumerate(trainable_var):
                print " var {:3}: {:15} {}".format(idx, str(v.get_shape()), v.name)
                # with tf.name_scope(v.name.replace(':0', '')):
                #     variable_summaries(v)
            reward_ph = tf.placeholder(tf.float32, [], name='reward')
            q_ph = tf.placeholder(tf.float32, [], name='q_pred')
            noise_ph = tf.placeholder(tf.float32, [], name='noise')
            epsilon_ph = tf.placeholder(tf.float32, [], name='epsilon')
            err_h_ph = tf.placeholder(tf.float32, [], name='err_h')
            err_c_ph = tf.placeholder(tf.float32, [], name='err_c')
            tf.summary.scalar('reward', reward_ph)
            tf.summary.scalar('q_estimate', q_ph)
            tf.summary.scalar('noise', noise_ph)
            tf.summary.scalar('epsilon', epsilon_ph)
            tf.summary.scalar('err_h', err_h_ph)
            tf.summary.scalar('err_c', err_c_ph)
            merged = tf.summary.merge_all()
            summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

        part_var = []
        for idx, v in enumerate(trainable_var):
            if 'actor/online/encoder' in v.name or 'actor/online/controller' in v.name:
                part_var.append(v)
        saver = tf.train.Saver(trainable_var, max_to_keep=5)
        part_saver = tf.train.Saver(part_var, max_to_keep=5)

        # load model
        if 'empty' in flags.init_model_name:
            checkpoint = tf.train.get_checkpoint_state(model_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ", checkpoint.model_checkpoint_path)
            else:
                print('No model is found')
        else:
            checkpoint = tf.train.get_checkpoint_state(init_dir)
            if checkpoint and checkpoint.model_checkpoint_path:
                part_saver.restore(sess, checkpoint.model_checkpoint_path)
                print("Network model loaded: ", checkpoint.model_checkpoint_path)
            else:
                print('No model is found')

        episode = 0
        T = 0
        epsilon = flags.init_epsilon
        noise = flags.init_noise

        # start training
        room_position = np.array([-1, -1])
        while T < flags.total_steps and not rospy.is_shutdown():
            print ''
            # set robot in a room
            init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom()
            env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]])
            time.sleep(1.)

            # generate a path to other room
            target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(room_position)

            # get a long path
            map_plan, real_route = world.GetPath(init_map_pose + target_map_pose)
            dynamic_route = copy.deepcopy(real_route)
            env.LongPathPublish(real_route)
            time.sleep(1.)
            if len(map_plan) == 0:
                print 'no path'
                continue

            pose = env.GetSelfStateGT()
            target_table_point_list, cmd_list, check_point_list = world.GetCommandPlan(pose, real_route)
            # print 'target_table_point_list:', target_table_point_list
            print 'cmd_list:', cmd_list
            if len(target_table_point_list) == 0:
                print 'no check point'
                continue

            table_goal = target_table_point_list[0]
            goal = world.Table2RealPosition(table_goal)
            env.target_point = goal
            env.distance = np.sqrt(np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

            total_reward = 0
            laser = env.GetLaserObservation()
            laser_stack = np.stack([laser, laser, laser], axis=-1)
            action = [0., 0.]
            epi_q = []
            loop_time = []
            t = 0
            terminal = False
            result = 0
            cmd_idx = 0
            status = [0]
            prev_action = [0., 0.]
            err_h_epi, err_c_epi = 0., 0.
            status_cnt = 5
            if np.random.rand() <= epsilon:
                true_flag = True
                print '-------using true actions------'
            else:
                true_flag = False
            prev_state = (np.zeros((1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))
            prev_state_2 = (np.zeros((1, agent.n_hidden)), np.zeros((1, agent.n_hidden)))

            while not rospy.is_shutdown():
                start_time = time.time()
                if t == 0:
                    print 'current cmd:', cmd_list[cmd_idx]

                terminal, result, reward = env.GetRewardAndTerminate(t)
                total_reward += reward

                if t > 0:
                    status = [1] if result == 1 else [0]
                    agent.Add2Mem((laser_stack, cmd, cmd_next, cmd_skip, prev_action,
                                   local_goal, prev_state, prev_state_2, action,
                                   reward, terminal, status, true_action))
                    prev_state = copy.deepcopy(state)
                    prev_state_2 = copy.deepcopy(state_2)
                    prev_action = copy.deepcopy(action)

                if result > 1:
                    break
                elif result == 1:
                    if cmd_list[cmd_idx] == 5:
                        print 'Finish!!!!!!!'
                        break
                    cmd_idx += 1
                    t = 0
                    status_cnt = 0
                    table_goal = target_table_point_list[cmd_idx]
                    goal = world.Table2RealPosition(table_goal)
                    env.target_point = goal
                    env.distance = np.sqrt(np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))
                    continue

                local_goal = env.GetLocalPoint(goal)
                env.PathPublish(local_goal)

                cmd = [cmd_list[cmd_idx]]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [cmd_list[cmd_idx + 1]]
                if status_cnt < 5:
                    cmd = [0]
                    cmd_skip = [0]
                else:
                    cmd_next = [0]
                status_cnt += 1
                # print '{}, {}, {}'.format(cmd[0], cmd_next[0], cmd_skip[0])

                laser = env.GetLaserObservation()
                laser_stack = np.stack([laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

                if not flags.testing:
                    if noise > 0:
                        noise -= flags.init_noise / flags.noise_stop_episode
                        epsilon -= flags.init_epsilon / flags.noise_stop_episode
                time1 = time.time() - start_time

                # predict action
                if t > 0:
                    prev_action = action
                if cmd_list[cmd_idx] == 5:
                    local_goal_a = local_goal
                else:
                    local_goal_a = [0., 0.]

                # input_laser, input_cmd, input_cmd_next, prev_status, prev_action, input_goal, prev_state
                action, state, state_2 = agent.ActorPredict([laser_stack], [cmd], [cmd_next],
                                                            [cmd_skip], [action], [local_goal_a],
                                                            prev_state, prev_state_2)
                if not flags.testing:
                    if T < flags.noise_stop_episode:
                        action += (exploration_noise.noise() * np.asarray(agent.action_range) * noise)

                # get true action
                pose = env.GetSelfStateGT()
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                local_near_goal = env.GetLocalPoint(near_goal)
                true_action = env.Controller(local_near_goal, None, 1)
                if true_flag:
                    action = true_action

                env.SelfControl(action, agent.action_range)
                time2 = time.time() - start_time - time1

                if T > agent.batch_size and not flags.testing:
                    q, err_h, err_c = agent.Train()
                    epi_q.append(np.amax(q))
                    err_h_epi += err_h
                    err_c_epi += err_c

                if (T + 1) % flags.steps_per_checkpoint == 0 and not flags.testing:
                    saver.save(sess, os.path.join(model_dir, 'network'), global_step=T + 1)

                t += 1
                T += 1
                loop_time.append(time.time() - start_time)
                time3 = time.time() - start_time - time1 - time2
                used_time = time.time() - start_time
                if used_time > 0.04:
                    print '{:.4f} | {:.4f} | {:.4f} | {:.4f}'.format(time1, time2, time3, used_time)
                rate.sleep()

            if T > agent.batch_size and not flags.testing and t > 0:
                if not true_flag:
                    summary = sess.run(merged, feed_dict={reward_ph: total_reward,
                                                          q_ph: np.amax(q),
                                                          noise_ph: noise,
                                                          epsilon_ph: epsilon,
                                                          err_h_ph: err_h_epi / t,
                                                          err_c_ph: err_c_epi / t})
                    summary_writer.add_summary(summary, T)
                    print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Q:{:.2f} | Loop Time:{:.3f} '.format(
                        episode, t, total_reward, T, np.amax(q), np.mean(loop_time))
                else:
                    print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Loop Time:{:.3f} '.format(
                        episode, t, total_reward, T, np.mean(loop_time))
            episode += 1
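

# ---------------------------------------------------------------------------
# Both training loops push transitions into the agent with agent.Add2Mem(...)
# and start calling agent.Train() once the memory exceeds the batch size. The
# RDPG class is defined elsewhere; the sketch below is only a guess at what a
# minimal agent-side memory could look like (a bounded deque of transition
# tuples with uniform sampling). A real recurrent-DPG buffer would more likely
# store and sample whole episodes so that hidden states stay aligned.
# ---------------------------------------------------------------------------
import random
from collections import deque


class ReplayMemorySketch(object):
    def __init__(self, max_size=10000):
        # oldest transitions are dropped automatically once max_size is reached
        self.memory = deque(maxlen=max_size)

    def Add2Mem(self, transition):
        # transition is whatever tuple/list the caller assembles, e.g.
        # (observation, cmd, prev_action, goal, rnn_state, action, reward, done)
        self.memory.append(transition)

    def __len__(self):
        return len(self.memory)

    def SampleBatch(self, batch_size):
        # uniform sampling; hypothetical helper, not part of the RDPG API shown above
        return random.sample(list(self.memory), batch_size)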
def il_training(sess, model):
    # init network
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)

    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(trainable_var):
        print ' var {:3}: {:15} {}'.format(idx, str(v.get_shape()), v.name)
        with tf.name_scope(v.name.replace(':0', '')):
            model_utils.variable_summaries(v)

    saver = tf.train.Saver(max_to_keep=5)
    checkpoint = tf.train.get_checkpoint_state(flags.load_model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print 'model loaded: ', checkpoint.model_checkpoint_path
    else:
        print 'model not found'
    summary_writer = tf.summary.FileWriter(model_dir, sess.graph)

    # init environment
    world = GridWorld()
    env = GazeboWorld('robot1')
    obj_list = env.GetModelStates()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)
    FileProcess()
    print "Env initialized"

    rate = rospy.Rate(5.)
    T = 0
    episode = 0
    time.sleep(2.)

    # start learning
    demo_flag = True
    result_buf = []
    data_buffer = data_utils.data_buffer(flags.buffer_size)
    training_start_time = time.time()
    while not rospy.is_shutdown() and episode < flags.max_epoch:
        time.sleep(1.)
        if demo_flag:
            world.CreateMap()
            if episode % 20 == 0:
                print 'randomising the environment'
                obj_pose_dict = world.RandomEnv(obj_list)
                for name in obj_pose_dict:
                    env.SetObjectPose(name, obj_pose_dict[name])
                time.sleep(2.)
                print 'randomisation finished'
            obj_list = env.GetModelStates()
            world.MapObjects(obj_list)
            world.GetAugMap()
            map_route, real_route, init_pose = world.RandomPath()

            demo_img_buf = []
            demo_a_buf = []
            eta = np.zeros([1], np.float32)
            gru_h = np.zeros([1, flags.n_hidden], np.float32)

            # choose which time steps of the demonstration are kept
            delta = flags.max_step / flags.demo_len
            if flags.demo_interval_mode == 'random':
                demo_indices = random.sample(range(flags.max_step - 1), flags.demo_len - 1)
                demo_indices += [flags.max_step - 1]
                demo_indices.sort()
            elif flags.demo_interval_mode == 'semi_random':
                demo_indices = []
                for idx in range(flags.demo_len - 1):
                    demo_indices.append(delta * idx + np.random.randint(delta))
                demo_indices += [flags.max_step - 1]
            elif flags.demo_interval_mode == 'fixed':
                # fixed interval of delta steps (the original referenced an
                # undefined `interval`; delta is the evenly spaced equivalent)
                demo_indices = range(delta - 1, flags.max_step, delta)
        else:
            if result > 2:
                demo_flag = not demo_flag
                continue

        env.SetObjectPose('robot1', [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)
        time.sleep(0.1)
        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)
        time.sleep(0.1)

        pose = env.GetSelfStateGT()
        if demo_flag:
            goal = real_route[-1]
        else:
            goal = last_position
        env.target_point = goal
        env.distance = np.sqrt(np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False
        img_seq = []
        a_seq = []
        reach_flag = False

        while not rospy.is_shutdown():
            start_time = time.time()
            terminal, result, reward = env.GetRewardAndTerminate(t, max_step=flags.max_step)
            total_reward += reward
            if result == 1:
                reach_flag = True
            if result > 1 and demo_flag:
                break
            elif not demo_flag and result == 2:
                break

            rgb_image = env.GetRGBImageObservation()
            local_goal = env.GetLocalPoint(goal)
            pose = env.GetSelfStateGT()
            try:
                near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
            except:
                pass
            local_near_goal = env.GetLocalPoint(near_goal)
            env.PathPublish(local_near_goal)
            action_gt = env.Controller(local_near_goal, None, 1)
            img_seq.append(rgb_image)
            a_seq.append(action_gt)

            if not demo_flag:
                action, eta, gru_h = model.predict(demo_img_buf, demo_a_buf, eta[0],
                                                   [rgb_image / 255.], gru_h)
                action = action[0]
                demo_idx = min(int(round(eta[0])), len(demo_img_buf) - 1)
                demo_img_pub = np.asarray(demo_img_buf[demo_idx] * 255., dtype=np.uint8)
                env.PublishDemoRGBImage(demo_img_pub, demo_idx)
                env.SelfControl(action, [0.3, np.pi / 6])
            else:
                if t in demo_indices:
                    demo_img_buf.append(rgb_image / 255.)
                    demo_a_buf.append(action_gt)
                    env.PublishDemoRGBImage(rgb_image, len(demo_a_buf) - 1)
                env.SelfControl(action_gt, [0.3, np.pi / 6])

            t += 1
            T += 1
            rate.sleep()
            loop_time.append(time.time() - start_time)

        if len(img_seq) == flags.max_step and len(demo_img_buf) == flags.demo_len:
            data_buffer.save_sample(img_seq, a_seq, demo_img_buf, demo_a_buf)
        else:
            print 'data length incorrect:', len(img_seq), len(demo_img_buf)

        # training
        if len(data_buffer.data) >= flags.batch_size:
            for k in range(len(data_buffer.data) / flags.batch_size):
                batch_data = data_buffer.sample_a_batch(flags.batch_size)
                batch_img_seq, batch_action_seq, batch_demo_img_seq, batch_demo_action_seq = batch_data
                action_seq, eta_array, loss, _ = model.train(batch_demo_img_seq, batch_demo_action_seq,
                                                             batch_img_seq, batch_action_seq)
        else:
            loss = -1.

        if not demo_flag:
            info_train = '| Episode:{:3d}'.format(episode) + \
                         '| t:{:3d}'.format(t) + \
                         '| Loss: {:2.5f}'.format(loss) + \
                         '| Reach: {:1d}'.format(int(reach_flag)) + \
                         '| Time(min): {:2.1f}'.format((time.time() - training_start_time) / 60.) + \
                         '| LoopTime(s): {:.3f}'.format(np.mean(loop_time))
            print info_train
            # summary = sess.run(merged)
            summary = tf.Summary()
            summary.value.add(tag='Result', simple_value=float(reach_flag))
            summary_writer.add_summary(summary, episode)
            episode += 1
            if flags.save_model and episode % 100 == 0:
                saver.save(sess, os.path.join(model_dir, 'network'), global_step=episode)
        else:
            last_position = pose[:2]
        # alternate between collecting a demonstration and running the policy
        demo_flag = not demo_flag
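

# ---------------------------------------------------------------------------
# il_training() relies on data_utils.data_buffer(flags.buffer_size), with a
# .data list, .save_sample(img_seq, a_seq, demo_img_buf, demo_a_buf) and
# .sample_a_batch(batch_size) returning (img_seq, action_seq, demo_img_seq,
# demo_action_seq) batches. That module is not included here; the class below
# is a minimal sketch consistent with that usage, assuming a FIFO buffer with
# uniform sampling, not the author's actual implementation.
# ---------------------------------------------------------------------------
import random


class DataBufferSketch(object):
    def __init__(self, size):
        self.size = size
        self.data = []

    def save_sample(self, img_seq, a_seq, demo_img_seq, demo_a_seq):
        # drop the oldest sample once the buffer is full (FIFO)
        if len(self.data) >= self.size:
            self.data.pop(0)
        self.data.append((img_seq, a_seq, demo_img_seq, demo_a_seq))

    def sample_a_batch(self, batch_size):
        # uniformly sample stored episodes and regroup them field by field
        batch = random.sample(self.data, batch_size)
        img_seqs = [b[0] for b in batch]
        a_seqs = [b[1] for b in batch]
        demo_img_seqs = [b[2] for b in batch]
        demo_a_seqs = [b[3] for b in batch]
        return img_seqs, a_seqs, demo_img_seqs, demo_a_seqs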
def testing(sess, model):
    # --------------------load model-------------------------
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    init_dir = os.path.join(flags.model_dir, flags.init_model_name)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)

    all_var = tf.global_variables()
    load_var = all_var[:21]
    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(load_var):
        print(" var {:3}: {:15} {}".format(idx, str(v.get_shape()), v.name))
        if 'encoder' in v.name or 'controller' in v.name:
            part_var.append(v)
    saver = tf.train.Saver(trainable_var, max_to_keep=5)
    part_saver = tf.train.Saver(load_var, max_to_keep=1)

    # load model
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        part_saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Network model loaded: ", checkpoint.model_checkpoint_path)
    else:
        print('No model is found')

    # ----------------------init env-------------------------
    # np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)
    world = GridWorld()
    switch_action = False
    # world.map, switch_action = world.RandomSwitchRoom()
    world.GetAugMap()
    world.CreateTable()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)
    FileProcess()

    robot_name = 'robot1'
    rviz = False
    env = GazeboWorld(world.table, robot_name, rviz=rviz)
    print "Env initialized"
    time.sleep(2.)
    rate = rospy.Rate(5.)

    pickle_path = os.path.join(CWD, 'world/model_states_data.p')
    env.GetModelStates()
    env.ResetWorld()
    env.ResetModelsPose(pickle_path)
    if switch_action:
        env.SwitchRoom(switch_action)
    env.state_call_back_flag = True
    env.target_theta_range = 1.
    time.sleep(2.)

    init_pose = world.RandomInitPose()
    env.target_point = init_pose

    episode = 0
    T = 0

    # ------------------start to test------------------------
    room_position = np.array([-1, -1])
    SR_cnt = 0
    TIME_list = []
    POSE_list = []
    while not rospy.is_shutdown():
        print ''
        time.sleep(1.)

        # set robot in a room
        init_pose, init_map_pose, room_position = world.RandomInitPoseInRoom(room_position)
        env.SetObjectPose(robot_name, [init_pose[0], init_pose[1], 0., init_pose[2]])

        # generate a path to other room
        target_pose, target_map_pose, _ = world.RandomInitPoseInRoom(room_position)

        # get a long path
        map_plan, real_route = world.GetPath(init_map_pose + target_map_pose)
        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)
        if len(map_plan) == 0:
            print 'no path'
            continue

        pose = env.GetSelfStateGT()
        target_table_point_list, cmd_list, checkpoint_list = world.GetCommandPlan(pose, real_route)
        cmd_list += [0]
        print 'target_table_point_list:', target_table_point_list
        print 'cmd_list:', cmd_list
        if len(target_table_point_list) == 0:
            print 'no check point'
            continue

        table_goal = target_table_point_list[0]
        goal = world.Table2RealPosition(table_goal)
        env.target_point = goal
        env.distance = np.sqrt(np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        laser = env.GetLaserObservation()
        laser_stack = np.stack([laser, laser, laser], axis=-1)
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False
        result = 0
        cmd_idx = 0
        prob = 0.
        used_action = [0., 0.]
        true_action = [0., 0.]
        logits = [0., 0.]
        cnt = 5
        status = 0
        prev_status = 0
        exploration_noise = OUNoise(action_dimension=flags.dim_action,
                                    mu=flags.mu, theta=flags.theta, sigma=flags.sigma)
        epi_start_time = time.time()
        positions = []
        CMD_list = []

        while not rospy.is_shutdown():
            start_time = time.time()
            if t == 0:
                print 'current cmd:', cmd_list[cmd_idx]

            prev_result = result
            terminal, result, reward = env.GetRewardAndTerminate(t)
            total_reward += reward
            # if t > 0:
            #     print '{}, {}, | {:.3f}, {:.3f} | {}, {}, {}'.format(result, status, logits[0], logits[1], cmd[0], cmd_next[0], cmd_skip[0])

            true_status = 0
            if result > 1:
                break
            # elif result == 1:
            #     true_status = 1
            #     if cmd_list[cmd_idx] == 5:
            #         print 'Finish!!!!!!!'
            #         break
            #     cmd_idx += 1
            #     t = 0
            #     table_goal = target_table_point_list[cmd_idx]
            #     goal = world.Table2RealPosition(table_goal)
            #     env.target_point = goal
            #     env.distance = np.sqrt(np.linalg.norm([pose[0]-goal[0], pose[1]-goal[1]]))
            #     continue

            local_goal = env.GetLocalPoint(goal)
            env.PathPublish(local_goal)

            if cnt < 5:
                cnt += 1
                cmd = [0]
                cmd_next = [cmd_list[cmd_idx]]
                cmd_skip = [0]
            else:
                cmd = [cmd_list[cmd_idx]]
                cmd_next = [0]
                cmd_skip = [cmd_list[cmd_idx + 1]]
            CMD_list.append(cmd[0])
            curr_status = [cmd[0] * flags.n_cmd_type + cmd_next[0]]
            next_status = [cmd_next[0] * flags.n_cmd_type + cmd_skip[0]]

            prev_laser = laser
            laser = env.GetLaserObservation()
            laser_stack = np.stack([laser, laser_stack[:, 0], laser_stack[:, 1]], axis=-1)

            # predict action
            if cmd_list[cmd_idx] == 5:
                local_goal_a = local_goal
            else:
                local_goal_a = [0., 0.]
            prev_status = copy.deepcopy(status)
            status, action, logits, _ = model.Predict([laser_stack], [curr_status], [next_status],
                                                      [local_goal_a], t,
                                                      [[used_action[0], used_action[1]]])
            action += (exploration_noise.noise() * np.asarray(model.action_range) * 0.1)

            if prev_status == 0 and status == 1 and cnt >= 5:
                if cmd_list[cmd_idx] == 5:
                    print 'Finish!!!!!!!'
                    break
                cmd_idx += 1
                t = 0
                cnt = 0
                table_goal = target_table_point_list[cmd_idx]
                goal = world.Table2RealPosition(table_goal)
                env.target_point = goal
                env.distance = np.sqrt(np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))
                continue

            # get action
            pose = env.GetSelfStateGT()
            near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
            local_near_goal = env.GetLocalPoint(near_goal)
            true_action = env.Controller(local_near_goal, None, 1)
            positions.append(pose[:2])
            # print action - np.asarray(true_action), action
            if cmd_list[cmd_idx] == 0:
                used_action = true_action
            else:
                used_action = action
            env.SelfControl(used_action, [flags.a_linear_range, flags.a_angular_range])

            t += 1
            T += 1
            loop_time.append(time.time() - start_time)
            rate.sleep()

        print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:}'.format(episode, t, total_reward, T)
        episode += 1
        if cmd[0] == 5:
            SR_cnt += 1.0
            TIME_list.append(time.time() - epi_start_time)
            # LogData(positions, CMD_list, int(SR_cnt), os.path.join(CWD, 'experiments/positions'))
        # guard against an empty TIME_list before any episode has succeeded
        if len(TIME_list) > 0:
            print 'SR: {:.3f} | AVG TIME: {:.3f}, | MAX TIME: {:.3f}'.format(
                SR_cnt / episode, np.mean(TIME_list), np.amax(TIME_list))
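

# ---------------------------------------------------------------------------
# testing() above has a commented-out call to LogData(positions, CMD_list,
# int(SR_cnt), <dir>), which is not defined in this file. A hypothetical
# implementation consistent with that call, writing one CSV of visited
# positions and issued commands per successful run, might look like the
# following; the file naming and format are assumptions, not the author's code.
# ---------------------------------------------------------------------------
import csv
import os


def LogDataSketch(positions, cmd_list, run_id, out_dir):
    # create the output directory on first use
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    file_name = os.path.join(out_dir, 'run_{:d}.csv'.format(run_id))
    with open(file_name, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y', 'cmd'])
        # positions is a list of [x, y] pairs, cmd_list the command at each step
        for (x, y), cmd in zip(positions, cmd_list):
            writer.writerow([x, y, cmd])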
def testing(sess, model):
    # init network
    model_dir = os.path.join(flags.model_dir, flags.model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)

    trainable_var = tf.trainable_variables()
    part_var = []
    for idx, v in enumerate(trainable_var):
        print ' var {:3}: {:15} {}'.format(idx, str(v.get_shape()), v.name)
        with tf.name_scope(v.name.replace(':0', '')):
            model_utils.variable_summaries(v)

    saver = tf.train.Saver(max_to_keep=5)
    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print 'model loaded: ', checkpoint.model_checkpoint_path
    else:
        print 'model not found'

    # init environment
    world = GridWorld()
    env = GazeboWorld('robot1')
    obj_list = env.GetModelStates()
    cv2.imwrite('./world/map.png', np.flipud(1 - world.map) * 255)
    FileProcess()
    print "Env initialized"

    rate = rospy.Rate(5.)
    T = 0
    episode = 0
    time.sleep(2.)

    # start testing
    demo_flag = True
    result_buf = []
    while not rospy.is_shutdown() and episode < 20:
        time.sleep(2.)
        if demo_flag:
            world.CreateMap()
            if (episode + 1) % 20 == 0:
                print 'randomising the environment'
                obj_pose_dict = world.RandomEnv(obj_list)
                for name in obj_pose_dict:
                    env.SetObjectPose(name, obj_pose_dict[name])
                time.sleep(2.)
                print 'randomisation finished'
            obj_list = env.GetModelStates()
            world.MapObjects(obj_list)
            world.GetAugMap()
            map_route, real_route, init_pose = world.RandomPath()

            demo_img_buf = []
            demo_a_buf = []
            eta = np.zeros([1], np.float32)
            gru_h = np.zeros([1, flags.n_hidden], np.float32)
            delta = np.random.randint(flags.max_step / flags.demo_len)
            delta = flags.max_step / flags.demo_len - 1
        else:
            if result > 2:
                continue

        env.SetObjectPose('robot1', [init_pose[0], init_pose[1], 0., init_pose[2]], once=True)
        time.sleep(1)
        dynamic_route = copy.deepcopy(real_route)
        env.LongPathPublish(real_route)
        time.sleep(1.)

        pose = env.GetSelfStateGT()
        goal = real_route[-1]
        env.target_point = goal
        env.distance = np.sqrt(np.linalg.norm([pose[0] - goal[0], pose[1] - goal[1]]))

        total_reward = 0
        prev_action = [0., 0.]
        epi_q = []
        loop_time = []
        t = 0
        terminal = False

        while not rospy.is_shutdown():
            terminal, result, reward = env.GetRewardAndTerminate(t, max_step=flags.max_step)
            total_reward += reward
            if terminal:
                break

            rgb_image = env.GetRGBImageObservation()
            if demo_flag:
                local_goal = env.GetLocalPoint(goal)
                # env.PathPublish(local_goal)
                pose = env.GetSelfStateGT()
                try:
                    near_goal, dynamic_route = world.GetNextNearGoal(dynamic_route, pose)
                except:
                    pass
                local_near_goal = env.GetLocalPoint(near_goal)
                action = env.Controller(local_near_goal, None, 1)
                # if (t+1) % (flags.max_step/flags.demo_len) == 0:
                #     demo_img_buf.append(rgb_image/255.)
                #     demo_a_buf.append(action)
                #     env.PublishDemoRGBImage(rgb_image, len(demo_a_buf)-1)
                if t == (flags.max_step / flags.demo_len) * len(demo_img_buf) + delta:
                    demo_img_buf.append(rgb_image / 255.)
                    demo_a_buf.append(action)
                    env.PublishDemoRGBImage(rgb_image, len(demo_a_buf) - 1)
                    # delta = np.random.randint(flags.max_step/flags.demo_len)
            else:
                start_time = time.time()
                action, eta, gru_h = model.predict(demo_img_buf, demo_a_buf, eta[0],
                                                   [rgb_image / 255.], gru_h)
                loop_time.append(time.time() - start_time)
                action = action[0]
                demo_idx = min(int(round(eta[0])), len(demo_img_buf) - 1)
                demo_img_pub = np.asarray(demo_img_buf[demo_idx] * 255., dtype=np.uint8)
                env.PublishDemoRGBImage(demo_img_pub, demo_idx)

            env.SelfControl(action, [0.3, np.pi / 6])
            t += 1
            T += 1
            rate.sleep()
            # print '{:.4f}'.format(time.time() - start_time)

        if demo_flag:
            print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Demo: {:}'.format(
                episode, t, total_reward, T, demo_flag)
        else:
            result_buf.append(result)
            print 'Episode:{:} | Steps:{:} | Reward:{:.2f} | T:{:} | Demo: {:}, | PredTime: {:.4f}'.format(
                episode, t, total_reward, T, demo_flag, np.mean(loop_time))
        episode += 1
        demo_flag = not demo_flag

    result_buf = np.asarray(result_buf)
    result_buf[result_buf <= 2] = 1
    result_buf[result_buf > 2] = 0
    print 'success rate: {:.2f}'.format(np.mean(result_buf))