Example 1
def run(comm, env, policy, policy_path, action_bound, optimizer):

    # rate = rospy.Rate(5)
    buff = []
    global_update = ID_EP
    global_step = 0

    if env.index == 0:
        env.reset_world()
    env.store_resetPose()

    for id in range(MAX_EPISODES):
        env.generate_goal_point()
        print("new goal:", env.goal_point, "and rnak:", env.mpi_rank)
        terminal = False
        next_ep = False
        ep_reward = 0
        step = 1

        env.reset_pose()

        obs = env.get_laser_observation()
        if not LASER_NORM:
            obs = (obs + 0.5) * 3.5
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not next_ep and not rospy.is_shutdown():

            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(
                env=env,
                state_list=state_list,
                policy=policy,
                action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)

            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            ep_reward += r
            global_step += 1

            if (result == "Reach Goal"):
                env.generate_goal_point()
                print("new goal:", env.goal_point)
            if (terminal):
                env.reset_pose()
                obs = env.get_laser_observation()
                # if(not LASER_NORM):
                #     obs = (obs + 0.5)*3.5
                obs_stack = deque([obs, obs, obs])
            if (step > EP_LEN):
                next_ep = True

            # get next state
            s_next = env.get_laser_observation()
            # if(not LASER_NORM):
            #     s_next = (s_next + 0.5) * 3.5
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env,
                                                  state_list=state_next_list,
                                                  policy=policy,
                                                  action_bound=action_bound)
            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.mpi_rank == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(
                        rewards=r_batch,
                        gamma=GAMMA,
                        values=v_batch,
                        last_value=last_v,
                        dones=d_batch,
                        lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch,
                              l_batch, t_batch, v_batch, r_batch, advs_batch)
                    print("PPO update start")
                    ppo_update_stage1(policy=policy,
                                      optimizer=optimizer,
                                      batch_size=BATCH_SIZE,
                                      memory=memory,
                                      epoch=EPOCH,
                                      coeff_entropy=COEFF_ENTROPY,
                                      clip_value=CLIP_VALUE,
                                      num_step=HORIZON,
                                      num_env=NUM_ENV,
                                      frames=LASER_HIST,
                                      obs_size=OBS_SIZE,
                                      act_size=ACT_SIZE)

                    print("PPO update finish")
                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.mpi_rank == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(),
                           policy_path + '/Stage1_{}'.format(global_update))
                logger.info(
                    '########################## model saved when update {} times#########'
                    '################'.format(global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.mpi_rank, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        print("befpre ep_reward")
        logger_cal.info(ep_reward)
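All four examples reference the same imports, helper functions (generate_action, transform_buffer, generate_train_data, the ppo_update_stage* routines), loggers, and hyperparameter constants without defining them. The following is a minimal sketch of the module-level setup they assume; the values are illustrative placeholders, not the settings used to produce these examples.

# Minimal sketch of the imports and constants the run() examples assume.
# All values are illustrative placeholders.
from collections import deque

import numpy as np
import rospy
import torch

MAX_EPISODES = 5000    # outer episode loop length
EP_LEN = 128           # max steps per episode (Example 1 only)
HORIZON = 128          # transitions collected before each PPO update
BATCH_SIZE = 1024      # minibatch size inside ppo_update_stage*
EPOCH = 2              # PPO epochs per update
GAMMA = 0.99           # discount factor
LAMDA = 0.95           # GAE lambda (spelled as in the examples)
COEFF_ENTROPY = 5e-4   # entropy bonus coefficient
CLIP_VALUE = 0.1       # PPO clipping range
NUM_ENV = 1            # number of MPI workers / robots
LASER_HIST = 3         # stacked laser frames per observation
OBS_SIZE = 512         # beams per laser scan
ACT_SIZE = 2           # (linear, angular) velocity command
LASER_NORM = True      # whether scans arrive pre-normalised (Example 1 only)
ID_EP = 0              # initial value of the update counter (Example 1 only)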
Example 2
def run(comm, env, policy, policy_path, action_bound, optimizer):
    rate = rospy.Rate(40)
    buff = []
    global_update = 0
    global_step = 0
    step = 0
    ep_reward = 0

    # import pdb; pdb.set_trace()

    if env.index == 0:
        # import pdb; pdb.set_trace()
        env.reset_world()

    for id in range(MAX_EPISODES):
        # env.reset_pose()
        env.generate_goal_point()

        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(
                env=env, state_list=state_list,
                policy=policy, action_bound=action_bound)
            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            # if liveflag == True:
            # print(real_action)
            env.control_vel(real_action)
            # rate.sleep()
            rospy.sleep(0.001)
            # get information
            r, terminal, result = env.get_reward_and_terminate(step)
            print('r: ', r)
            # r, terminal, result = 1,0,1 #env.get_reward_and_terminate(step)
            step += 1

            ep_reward += r
            if terminal:
                print('done: ', result)
                # import pdb; pdb.set_trace()
                env.reset_world()
                ep_reward = 0
                step = 1

            global_step += 1

            # get next state
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]


            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list, policy=policy,
                                                               action_bound=action_bound)
            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            terminal_list = comm.bcast(terminal_list, root=0)
            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    filter_index = get_filter_index(d_batch)
                    # print len(filter_index)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    # import pdb; pdb.set_trace()
                    ppo_update_stage2(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory, filter_index=filter_index,
                                            epoch=EPOCH, coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE, num_step=HORIZON,
                                            num_env=NUM_ENV, frames=LASER_HIST,
                                            obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1


            state = state_next



        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/gazebo_{}.pth'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id, step-1, ep_reward, result))
        logger_cal.info(ep_reward)
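Every example feeds the collected rollout to generate_train_data(rewards, gamma, values, last_value, dones, lam). Its body is not shown here, but the signature matches the standard Generalized Advantage Estimation recurrence. The sketch below illustrates that computation under the assumption that rewards, values and dones are arrays of shape [HORIZON, NUM_ENV] and last_value has shape [NUM_ENV]; it is not a copy of the original helper.

import numpy as np

def gae_targets(rewards, gamma, values, last_value, dones, lam):
    """Sketch of GAE(lambda): returns (targets, advantages), both of shape [T, N]."""
    T, N = rewards.shape
    advs = np.zeros((T, N), dtype=np.float32)
    gae = np.zeros(N, dtype=np.float32)
    next_value = last_value           # bootstrap value for the step after the horizon
    for t in reversed(range(T)):
        not_done = 1.0 - dones[t]     # stop bootstrapping across episode ends
        delta = rewards[t] + gamma * next_value * not_done - values[t]
        gae = delta + gamma * lam * not_done * gae
        advs[t] = gae
        next_value = values[t]
    targets = advs + values           # regression targets for the value head
    return targets, advs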
Example 3
def run(comm, env, policy, policy_path, action_bound, optimizer):

    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0
    accumulated_social_reward = 0

    if env.index == 0:
        env.reset_world()

    for id in range(MAX_EPISODES):
        env.reset_pose()

        env.generate_goal_point()
        terminal = False
        ep_reward = 0
        accumulated_social_reward = 0
        step = 1

        obs = env.get_laser_observation()
        obs_stack = deque([obs, obs, obs])
        goal = np.asarray(env.get_local_goal())
        speed = np.asarray(env.get_self_speed())
        state = [obs_stack, goal, speed]

        while not terminal and not rospy.is_shutdown():
            state_list = comm.gather(state, root=0)

            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(
                env=env,
                state_list=state_list,
                policy=policy,
                action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            env.control_vel(real_action)

            # rate.sleep()
            rospy.sleep(0.001)

            # get information
            r, terminal, result, social_reward = env.get_reward_and_terminate(
                step, SOCIAL_CALC_MIN_DISTANCE, SOCIAL_CALC_DELTA_T,
                SOCIAL_REWARD_COEFF)
            ep_reward += r
            global_step += 1
            accumulated_social_reward += social_reward

            # get next state
            s_next = env.get_laser_observation()
            left = obs_stack.popleft()
            obs_stack.append(s_next)
            goal_next = np.asarray(env.get_local_goal())
            speed_next = np.asarray(env.get_self_speed())
            state_next = [obs_stack, goal_next, speed_next]

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env,
                                                  state_list=state_next_list,
                                                  policy=policy,
                                                  action_bound=action_bound)
            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(
                        rewards=r_batch,
                        gamma=GAMMA,
                        values=v_batch,
                        last_value=last_v,
                        dones=d_batch,
                        lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch,
                              l_batch, t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage1(policy=policy,
                                      optimizer=optimizer,
                                      batch_size=BATCH_SIZE,
                                      memory=memory,
                                      epoch=EPOCH,
                                      coeff_entropy=COEFF_ENTROPY,
                                      clip_value=CLIP_VALUE,
                                      num_step=HORIZON,
                                      num_env=NUM_ENV,
                                      frames=LASER_HIST,
                                      obs_size=OBS_SIZE,
                                      act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next

        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(),
                           policy_path + '/Stage1_{}'.format(global_update))
                logger.info(
                    '########################## model saved when update {} times#########'
                    '################'.format(global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 +
                           (env.goal_point[1] - env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
        # log social reward
        logger_social.info("{},{}".format(env.index,
                                          accumulated_social_reward))
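The examples write to three pre-configured loggers: logger for the human-readable training log, logger_cal for per-episode rewards, and logger_social (Example 3 only) for the accumulated social reward. Their setup is not part of the examples; one possible configuration with the standard logging module, with names and file paths chosen here purely for illustration, is:

import logging

def make_logger(name, filename, fmt='%(asctime)s - %(levelname)s - %(message)s'):
    # File-backed logger; the names and paths are placeholders.
    log = logging.getLogger(name)
    log.setLevel(logging.INFO)
    handler = logging.FileHandler(filename, mode='a')
    handler.setFormatter(logging.Formatter(fmt))
    log.addHandler(handler)
    return log

logger = make_logger('ppo_train', 'output.log')
logger_cal = make_logger('ppo_reward', 'cal.log', fmt='%(message)s')
logger_social = make_logger('ppo_social', 'social.log', fmt='%(message)s')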
Example 4
def run(comm, env, policy, policy_path, action_bound, optimizer):

    # rate = rospy.Rate(5)
    buff = []
    global_update = 0
    global_step = 0

    # world reset
    if env.index == 0:
        env.reset_world()


    for id in range(MAX_EPISODES):
        
        # reset
        env.reset_pose()

        env.generate_goal_point()
        terminal = False
        ep_reward = 0
        step = 1

        # get_state

        state = env.get_state()

        while not terminal and not rospy.is_shutdown():
        
            state_list = comm.gather(state, root=0)


            ## get_action
            #-------------------------------------------------------------------------
            # generate actions at rank==0
            v, a, logprob, scaled_action = generate_action(
                env=env, state_list=state_list,
                policy=policy, action_bound=action_bound)

            # execute actions
            real_action = comm.scatter(scaled_action, root=0)
            #-------------------------------------------------------------------------            
            
            ### step ############################################################
            
            state_next, r, terminal, result = env.step(real_action, step)

            ep_reward += r
            global_step += 1

            # add transitions to buff and update policy
            r_list = comm.gather(r, root=0)
            terminal_list = comm.gather(terminal, root=0)

            ########################################################################

            if global_step % HORIZON == 0:
                state_next_list = comm.gather(state_next, root=0)
                last_v, _, _, _ = generate_action(env=env, state_list=state_next_list, policy=policy,
                                                               action_bound=action_bound)
            
            
            ## training
            #-------------------------------------------------------------------------
            if env.index == 0:
                buff.append((state_list, a, r_list, terminal_list, logprob, v))
                if len(buff) > HORIZON - 1:
                    s_batch, goal_batch, speed_batch, a_batch, r_batch, d_batch, l_batch, v_batch = \
                        transform_buffer(buff=buff)
                    t_batch, advs_batch = generate_train_data(rewards=r_batch, gamma=GAMMA, values=v_batch,
                                                              last_value=last_v, dones=d_batch, lam=LAMDA)
                    memory = (s_batch, goal_batch, speed_batch, a_batch, l_batch, t_batch, v_batch, r_batch, advs_batch)
                    ppo_update_stage1(policy=policy, optimizer=optimizer, batch_size=BATCH_SIZE, memory=memory,
                                            epoch=EPOCH, coeff_entropy=COEFF_ENTROPY, clip_value=CLIP_VALUE, num_step=HORIZON,
                                            num_env=NUM_ENV, frames=LASER_HIST,
                                            obs_size=OBS_SIZE, act_size=ACT_SIZE)

                    buff = []
                    global_update += 1

            step += 1
            state = state_next


        if env.index == 0:
            if global_update != 0 and global_update % 20 == 0:
                torch.save(policy.state_dict(), policy_path + '/Stage1_{}'.format(global_update))
                logger.info('########################## model saved when update {} times#########'
                            '################'.format(global_update))
        distance = np.sqrt((env.goal_point[0] - env.init_pose[0])**2 + (env.goal_point[1]-env.init_pose[1])**2)

        logger.info('Env %02d, Goal (%05.1f, %05.1f), Episode %05d, step %03d, Reward %-5.1f, Distance %05.1f, %s' % \
                    (env.index, env.goal_point[0], env.goal_point[1], id + 1, step, ep_reward, distance, result))
        logger_cal.info(ep_reward)
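All four run() variants rely on the same MPI pattern: every rank gathers its state to rank 0, rank 0 computes one action per rank, and comm.scatter hands each rank back its own command. The self-contained sketch below exercises just that data flow with mpi4py, using stand-ins for the state and for generate_action; it illustrates the plumbing only and is not the original launch script.

# Minimal sketch of the gather/scatter data flow used in run().
# Launch with several processes, e.g.:  mpiexec -n 4 python mpi_flow_sketch.py
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Stand-in for state = [obs_stack, goal, speed]
state = {'rank': rank, 'obs': [0.0, 0.0, 0.0]}

# Rank 0 receives a list with one entry per rank; other ranks receive None.
state_list = comm.gather(state, root=0)

if rank == 0:
    # Stand-in for generate_action(): one (v, w) command per gathered state.
    scaled_action = [(0.5, 0.0) for _ in state_list]
else:
    scaled_action = None

# Each rank gets back exactly its own command, like real_action in the examples.
real_action = comm.scatter(scaled_action, root=0)
print('rank', rank, 'would execute', real_action)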