def run(comm, env, policy, policy_path, action_bound, optimizer):
    """Stage-1 training loop: every MPI rank drives one robot; rank 0 gathers the
    rollouts and performs the PPO updates."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = ID_EP
    global_step = 0

    if env.index == 0:
        env.reset_world()
        env.store_resetPose()

    for id in range(MAX_EPISODES):
        env.generate_goal_point()
        print("new goal:", env.goal_point, "and rank:", env.mpi_rank)
        terminal = False
        next_ep = False
        ep_reward = 0
        step = 1

        env.reset_pose()

        obs = env.get_laser_observation()
        if not LASER_NORM:
            obs = (obs + 0.5) * 3.5
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not next_ep and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank == 0
            v, a, logprob, scaled_action = generate_action(
                env=env, state_list=state_list, policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            if result == "Reach Goal":
                env.generate_goal_point()
                print("new goal:", env.goal_point)
            if terminal:
                env.reset_pose()
                obs = env.get_laser_observation()
                # if not LASER_NORM:
                #     obs = (obs + 0.5) * 3.5
                obs_stack = deque([obs, obs, obs])
            if step > EP_LEN:
                next_ep = True

            # get next state
            s_next = env.get_laser_observation()
            # if not LASER_NORM:
            #     s_next = (s_next + 0.5) * 3.5
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            # add transitions to buff and update the policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.mpi_rank == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(
                        rewards=r_batch, gamma=GAMMA, values=v_batch,
                        last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch,
                              t_batch, v_batch, r_batch, advs_batch)
                    print("PPO update start")
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE,
                                      memory=memory, epoch=EPOCH, coeff_entropy=COEFF_ENTROPY,
                                      clip_value=CLIP_VALUE, num_step=HORIZON, num_env=NUM_ENV,
                                      frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE)
                    print("PPO update finish")
                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.mpi_rank == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, Step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.mpi_rank, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        print("before ep_reward")
        logger_cal.info(ep_reward)
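
# The loop above and the variants that follow all defer advantage computation to
# `generate_train_data(rewards, gamma, values, last_value, dones, lam)`, which is not
# part of this section. The sketch below is only a hedged illustration of the
# generalized advantage estimation (GAE) such a call is presumably performing,
# assuming rewards/values/dones arrive as arrays of shape (HORIZON, NUM_ENV) and
# `last_value` holds one bootstrap value per environment. `gae_sketch` is a
# hypothetical name, not the project's API.
import numpy as np


def gae_sketch(rewards, values, last_value, dones, gamma, lam):
    num_step, num_env = rewards.shape
    values_ext = np.concatenate([values, np.asarray(last_value).reshape(1, num_env)], axis=0)
    advs = np.zeros((num_step, num_env))
    running = np.zeros(num_env)
    for t in reversed(range(num_step)):
        nonterminal = 1.0 - dones[t]
        # one-step TD error, cut off at episode boundaries
        delta = rewards[t] + gamma * values_ext[t + 1] * nonterminal - values_ext[t]
        # exponentially weighted sum of TD errors (lambda-return advantage)
        running = delta + gamma * lam * nonterminal * running
        advs[t] = running
    targets = advs + values  # value-function regression targets
    return targets, advs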
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """Gazebo variant of the training loop: the world is reset in place whenever an
    episode terminates, and rank 0 updates the policy with ppo_update_stage2."""
    rate = rospy.Rate(40)
    buff = []
    global_update = 0
    global_step = 0
    step = 0
    ep_reward = 0

    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        # env.reset_pose()
        env.generate_goal_point()
        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        # this inner loop only exits when ROS shuts down; episodes are reset in place
        while not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank == 0
            v, a, logprob, scaled_action = generate_action(
                env=env, state_list=state_list, policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            print('r: ', r)
            step += 1
            ep_reward += r
            if terminal:
                print('done: ', result)
                env.reset_world()
                ep_reward = 0
                step = 1

            global_step += 1

            # get next state
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            # add transitions to buff and update the policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            terminal_list = comm.bcast(terminal_list, root=0)

            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    filter_index = get_filter_index(d_batch)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch,
                              t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage2(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE,
                                      memory=memory, filter_index=filter_index, epoch=EPOCH,
                                      coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE,
                                      num_step=HORIZON, num_env=NUM_ENV, frames=LASER_HIST,
                                      obs_size=OBS_SIZE, act_size=ACT_SIZE)
                    buff = []
                    global_update += 1

            state = state_next

        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/gazebo_{}.pth'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, Step %03d, Reward %-5.1f, %s,' %
                    (env.index, env.goal_point[0], env.goal_point[1], id, step - 1, ep_reward, result))
        logger_cal.info(ep_reward)
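
# The update above additionally passes `filter_index = get_filter_index(d_batch)` to
# `ppo_update_stage2`. `get_filter_index` is not shown in this section; as a hedged
# reading, it marks, for each environment, the second and later done flags inside one
# horizon so the update can skip samples gathered after that agent's episode already
# ended. `filter_index_sketch` below is an illustrative stand-in, assuming the batch is
# flattened step-major so sample (step j, env i) sits at index j * num_env + i.
def filter_index_sketch(d_batch):
    import numpy as np
    d_batch = np.asarray(d_batch)          # shape (HORIZON, NUM_ENV); 1 where done
    num_step, num_env = d_batch.shape
    index = []
    for i in range(num_env):
        done_steps = np.where(d_batch[:, i] == 1)[0]
        for j in done_steps[1:]:           # keep the first done, flag the rest
            index.append(int(j) * num_env + i)
    return index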
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """Stage-1 training loop variant that also accumulates a per-episode social reward
    returned by the environment and logs it through logger_social."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0
    accumulated_social_reward = 0

    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        env.reset_pose()
        env.generate_goal_point()
        terminal = False
        ep_reward = 0
        accumulated_social_reward = 0
        step = 1

        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank == 0
            v, a, logprob, scaled_action = generate_action(
                env=env, state_list=state_list, policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result, social_reward = env.get_reward_and_terminate(
                step, SOCIAL_CALC_MIN_DISTANCE, SOCIAL_CALC_DELTA_T, SOCIAL_REWARD_COEFF)
            ep_reward += r
            global_step += 1
            accumulated_social_reward += social_reward

            # get next state
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            # add transitions to buff and update the policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(
                        rewards=r_batch, gamma=GAMMA, values=v_batch,
                        last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch,
                              t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE,
                                      memory=memory, epoch=EPOCH, coeff_entropy=COEFF_ENTROPY,
                                      clip_value=CLIP_VALUE, num_step=HORIZON, num_env=NUM_ENV,
                                      frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE)
                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, Step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
        # log social reward
        logger_social.info("{},{}".format(env.index, accumulated_social_reward))
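
# The variant above only differs in that `env.get_reward_and_terminate` also returns a
# social reward shaped by SOCIAL_CALC_MIN_DISTANCE, SOCIAL_CALC_DELTA_T, and
# SOCIAL_REWARD_COEFF, which the loop accumulates and logs per episode. How that term
# is computed lies outside this section; one purely illustrative possibility is a
# penalty that grows the longer and the deeper the robot intrudes inside a comfort
# distance of nearby agents. `social_penalty_sketch` is a hypothetical helper, not the
# project's code.
def social_penalty_sketch(distances_to_agents, min_distance, delta_t, coeff):
    penalty = 0.0
    for d in distances_to_agents:          # distances (m) to nearby agents this step
        if d < min_distance:
            # penalize proportionally to the intrusion depth and the step duration
            penalty -= coeff * (min_distance - d) * delta_t
    return penalty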
def run(comm, env, policy, policy_path, action_bound, optimizer):
    """Refactored training loop: observation assembly and reward computation are
    delegated to env.get_state() and env.step()."""
    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0

    # world reset
    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        # reset
        env.reset_pose()
        env.generate_goal_point()
        terminal = False
        ep_reward = 0
        step = 1

        # get state
        state = env.get_state()

        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            ## get action
            # -----------------------------------------------------------------
            # generate actions at rank == 0
            v, a, logprob, scaled_action = generate_action(
                env=env, state_list=state_list, policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            # -----------------------------------------------------------------

            ## step
            # -----------------------------------------------------------------
            state_next, r, terminal, result = env.step(real_action, step)
            ep_reward += r
            global_step += 1

            # add transitions to buff and update the policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)
            # -----------------------------------------------------------------

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list,
                                                  policy=policy, action_bound=action_bound)

            ## training
            # -----------------------------------------------------------------
            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch,
                              t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE,
                                      memory=memory, epoch=EPOCH, coeff_entropy=COEFF_ENTROPY,
                                      clip_value=CLIP_VALUE, num_step=HORIZON, num_env=NUM_ENV,
                                      frames=LASER_HIST, obs_size=OBS_SIZE, act_size=ACT_SIZE)
                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, Step %03d, Reward %-5.1f, Distance %05.1f, %s' %
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
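
# None of the variants above construct `comm`, `env`, or `policy`; they expect the
# caller to wire them up. A minimal launch sketch under mpi4py is shown below.
# `StageWorld`, `CNNPolicy`, `LEARNING_RATE`, and the action bounds are assumptions
# standing in for whatever the surrounding project actually defines.
if __name__ == '__main__':
    from mpi4py import MPI
    from torch.optim import Adam

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    env = StageWorld(beam_num=OBS_SIZE, index=rank, num_env=NUM_ENV)  # hypothetical env class
    policy = CNNPolicy(frames=LASER_HIST, action_space=ACT_SIZE)      # hypothetical policy class
    opt = Adam(policy.parameters(), lr=LEARNING_RATE)                 # LEARNING_RATE assumed defined

    try:
        # each rank drives its own robot; rank 0 gathers rollouts and runs the updates
        run(comm=comm, env=env, policy=policy, policy_path='policy',
            action_bound=[[0, -1], [1, 1]], optimizer=opt)
    except KeyboardInterrupt:
        pass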