def main():
    render = True

    # Parameters
    horizon = 60
    total_repeats = 50

    state_dim = 12
    action_dim = 2
    env_name = 'SawyerSlide'

    ### Prepare Environment #####
    env = make(
        'SawyerSlide',
        gripper_type="SlidePanelGripper",
        parameters_to_randomise=slide_full_randomisation,
        randomise_initial_conditions=False,
        use_camera_obs=False,
        use_object_obs=True,
        reward_shaping=True,
        use_indicator_object=False,
        has_renderer=render,
        has_offscreen_renderer=False,
        render_collision_mesh=False,
        render_visual_mesh=True,
        control_freq=10,
        horizon=60,
        ignore_done=False,
        camera_name="frontview",
        camera_height=256,
        camera_width=256,
        camera_depth=False,
        pid=True,
    )

    env = FlattenWrapper(GymWrapper(env), keys='task-state', add_info=True)
    env._name = env_name
    env.reset()

    for policy_idx in range(1):
        ##### SETTING UP THE POLICY #########
        method = ['Single', 'LSTM', 'EPI', 'UPOSI'][policy_idx]
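        # 'Single' -> plain td3, 'LSTM' -> lstm_td3, 'EPI' -> epi_td3 (policy conditioned on a learned
        # trajectory embedding), 'UPOSI' -> uposi_td3 (universal policy + online system identification);
        # alg_idx selects the matching entry of alg_name below.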
        if (method == 'Single'):
            alg_idx = 1
        elif (method == 'LSTM'):
            alg_idx = 2
        elif (method == 'UPOSI'):
            alg_idx = 3
            osi_l = 5
            RANDOMISZED_ONLY = True
            DYNAMICS_ONLY = True
            CAT_INTERNAL = False  # the sliding env does not need the internal state and action appended
            params = query_params(env,
                                  randomised_only=RANDOMISZED_ONLY,
                                  dynamics_only=DYNAMICS_ONLY)
            params_dim = params.shape[
                0]  # dimension of parameters for prediction
            if CAT_INTERNAL:
                internal_state_dim = env.get_internal_state_dimension()
                _, _, _, info = env.step(np.zeros(action_dim))
                internal_action_dim = np.array(
                    info["joint_velocities"]).shape[0]
                osi_input_dim = osi_l * (state_dim + action_dim +
                                         internal_state_dim +
                                         internal_action_dim)
            else:
                osi_input_dim = osi_l * (state_dim + action_dim)
            state_dim += params_dim
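            # The OSI input is the flattened history of the last osi_l (state, action) pairs,
            # e.g. osi_l=5 with state_dim=12 and action_dim=2 gives 5 * (12 + 2) = 70 here;
            # the policy itself then observes [predicted params, obs], hence the larger state_dim.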
        elif (method == 'EPI'):
            alg_idx = 4
            embed_dim = 10
            traj_l = 10
            NO_RESET = True
            embed_input_dim = traj_l * (state_dim + action_dim)
            ori_state_dim = state_dim
            state_dim += embed_dim
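            # EPI conditions the policy on an embedding of a short probing trajectory: the embedding
            # network takes traj_l stacked (state, action) pairs, 10 * (12 + 2) = 140 inputs here,
            # and outputs an embed_dim-dimensional vector that is appended to every observation.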
        else:
            continue

        alg_name = ['sac', 'td3', 'lstm_td3', 'uposi_td3', 'epi_td3'][alg_idx]
        if method == 'EPI' or method == 'UPOSI':
            randomisation_idx_range = 1
        else:
            randomisation_idx_range = 4

        for randomisation_idx in range(1, 2):  # only the no-randomisation case; use range(randomisation_idx_range) to sweep all regimes
            goal_idx = 1
            ###### SETTING UP THE ENVIRONMENT ######
            randomisation_params = [
                slide_full_randomisation, [], slide_force_randomisation,
                slide_force_noise_randomisation
            ]
            env.change_parameters_to_randomise(
                randomisation_params[randomisation_idx])
            env.reset()

            if (render):
                # env.viewer.set_camera(camera_id=0)
                env.render()

            # choose which randomisation is applied
            randomisation_type = ['slide_full-randomisation', 'slide_no-randomisation', \
                                  'slide_force-randomisation', 'slide_force-&-noise-randomisation'][
                randomisation_idx]
            number_random_params = [22, 0, 2, 8][randomisation_idx]
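            # number_random_params mirrors the randomisation regimes above (full / none / force /
            # force+noise) and is baked into the filename of the pretrained model loaded below.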
            folder_path = '../../../../sawyer/src/sim2real_dynamics_sawyer/assets/rl/' + method + '/' + alg_name + '/model/'
            path = folder_path + env_name + str(
                number_random_params) + '_' + alg_name
            try:
                policy = load(path=path,
                              alg=alg_name,
                              state_dim=state_dim,
                              action_dim=action_dim)
                if method == 'UPOSI':
                    osi_model = load_model(model_name='osi',
                                           path=path,
                                           input_dim=osi_input_dim,
                                           output_dim=params_dim)
                elif method == 'EPI':
                    embed_model = load_model(model_name='embedding',
                                             path=path,
                                             input_dim=embed_input_dim,
                                             output_dim=embed_dim)
                    embed_model.cuda()
                    epi_policy_path = folder_path + env_name + str(
                        number_random_params) + '_' + 'epi_ppo_epi_policy'
                    epi_policy = load(path=epi_policy_path,
                                      alg='ppo',
                                      state_dim=ori_state_dim,
                                      action_dim=action_dim)

            except Exception:
                print('Could not load model for', method, randomisation_type)
                continue

            if (alg_name == 'lstm_td3'):
                hidden_out = (torch.zeros([1, 1, 512], dtype=torch.float).cuda(), \
                              torch.zeros([1, 1, 512],
                                          dtype=torch.float).cuda())  # initialize hidden state for lstm, (hidden, cell), each is (layer, batch, dim)
                last_action = np.array([0, 0])
            ######################################

            log_save_name = '{}_{}_{}_{}'.format(method, alg_name,
                                                 randomisation_type,
                                                 number_random_params)

            for repeat in range(total_repeats):
                # Reset environment
                obs = env.reset()
                if (render):
                    # env.viewer.set_camera(camera_id=0)
                    env.render()

                # Setup logger
                log_path = '../../../../data/sliding/sim/{}/goal_{}/trajectory_log_{}.csv'.format(
                    log_save_name, goal_idx, repeat)

                log_list = [
                    "step",
                    "time",
                    "cmd_j5",
                    "cmd_j6",
                    "obj_x",
                    "obj_y",
                    "obj_z",
                    "sin_z",
                    "cos_z",
                    "obj_vx",
                    "obj_vy",
                    "obj_vz",
                    "a_j5",
                    "a_j6",
                    "v_j5",
                    "v_j6",
                ]
                logger = Logger(log_list, log_path, verbatim=render)

                i = 0
                mujoco_start_time = env.sim.data.time

                if (alg_name == 'uposi_td3'):
                    uposi_traj = []
                    # params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
                    zero_osi_input = np.zeros(osi_input_dim)
                    pre_params = osi_model(zero_osi_input).detach().numpy()
                    params_state = np.concatenate((pre_params, obs))
                elif (alg_name == 'epi_td3'):

                    if NO_RESET:

                        i = traj_l - 1

                        traj, [last_obs, last_state] = EPIpolicy_rollout(
                            env,
                            epi_policy,
                            obs,
                            mujoco_start_time=mujoco_start_time,
                            logger=logger,
                            data_grabber=None,
                            max_steps=traj_l,
                            params=None
                        )  # only one traj; pass in params to ensure it's not changed
                        state_action_in_traj = np.array(
                            traj)[:, :-1]  # remove the rewards
                        embedding = embed_model(
                            state_action_in_traj.reshape(-1))
                        embedding = embedding.detach().cpu().numpy()

                        obs = last_obs  # last observation
                        env.set_state(last_state)  # last underlying state
                    else:

                        traj, [last_obs, last_state] = EPIpolicy_rollout(
                            env,
                            epi_policy,
                            obs,
                            mujoco_start_time=mujoco_start_time,
                            logger=None,
                            data_grabber=None,
                            max_steps=traj_l,
                            params=None
                        )  # only one traj; pass in params to ensure it's not changed
                        state_action_in_traj = np.array(
                            traj)[:, :-1]  # remove the rewards
                        embedding = embed_model(
                            state_action_in_traj.reshape(-1))
                        embedding = embedding.detach().cpu().numpy()

                        params = env.get_dynamics_parameters()
                        env.randomisation_off()
                        env.set_dynamics_parameters(
                            params)  # same as the rollout env
                        obs = env.reset()
                        env.randomisation_on()

                    obs = np.concatenate((obs, embedding))
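                    # from here on the policy always sees [obs, embedding]; the same episode-level
                    # embedding is re-appended to next_obs after every step below.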

                while (True):
                    mujoco_elapsed = env.sim.data.time - mujoco_start_time

                    #### CHOOSING THE ACTION #####
                    if (alg_name == 'lstm_td3'):
                        hidden_in = hidden_out
                        action, hidden_out = policy.get_action(obs,
                                                               last_action,
                                                               hidden_in,
                                                               noise_scale=0.0)
                        last_action = action
                    elif (alg_name == 'uposi_td3'):
                        # using internal state or not
                        if CAT_INTERNAL:
                            internal_state = env.get_internal_state()
                            full_state = np.concatenate([obs, internal_state])
                        else:
                            full_state = obs
                        action = policy.get_action(params_state,
                                                   noise_scale=0.0)
                    else:
                        action = policy.get_action(obs, noise_scale=0.0)
                    ###############################

                    next_obs, reward, done, info = env.step(action)
                    if (alg_name == 'uposi_td3'):
                        if CAT_INTERNAL:
                            target_joint_action = info["joint_velocities"]
                            full_action = np.concatenate(
                                [action, target_joint_action])
                        else:
                            full_action = action
                        uposi_traj.append(
                            np.concatenate((full_state, full_action)))

                        if len(uposi_traj) >= osi_l:
                            osi_input = stack_data(uposi_traj, osi_l)
                            pre_params = osi_model(osi_input).detach().numpy()
                        else:
                            zero_osi_input = np.zeros(osi_input_dim)
                            pre_params = osi_model(
                                zero_osi_input).detach().numpy()

                        next_params_state = np.concatenate(
                            (pre_params, next_obs))
                        params_state = next_params_state
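                        # params_state stays [OSI-predicted params, obs]; a zero-padded history is
                        # used until osi_l transitions have been collected.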
                    elif (alg_name == 'epi_td3'):
                        next_obs = np.concatenate((next_obs, embedding))
                    # Grab logging data
                    # assert len(obs) == 12

                    logger.log(
                        i,
                        mujoco_elapsed,
                        action[0],
                        action[1],
                        obs[0],
                        obs[1],
                        obs[2],
                        obs[3],
                        obs[4],
                        obs[5],
                        obs[6],
                        obs[7],
                        obs[8],
                        obs[9],
                        obs[10],
                        obs[11],
                    )

                    obs = next_obs

                    if (render):
                        env.render()

                    i += 1
                    if (i >= horizon or done):
                        break

    env.close()
def worker(id, td3_trainer, osi_model, osi_l, osi_input_dim, osi_batch_size, osi_itr, osi_pretrain_eps, env_name, environment_params, environment_wrappers,\
             environment_wrapper_arguments, rewards_queue, eval_rewards_queue, success_queue,\
            eval_success_queue, osi_loss_queue, eval_interval, replay_buffer, max_episodes, max_steps, batch_size, explore_steps, noise_decay, update_itr, explore_noise_scale, \
            eval_noise_scale, reward_scale, DETERMINISTIC, hidden_dim, model_path, seed=1):
    '''
    Sampling worker for multi-process training: rolls out episodes, pushes
    parameter-augmented transitions into the shared replay buffer, and either
    updates the TD3 trainer (universal-policy training) or fits the OSI model online.
    '''
    with torch.cuda.device(id % torch.cuda.device_count()):
        td3_trainer.to_cuda()
        print(td3_trainer, replay_buffer)
        env = make_env('robosuite.' + env_name, seed, id, environment_params,
                       environment_wrappers, environment_wrapper_arguments)()
        action_dim = env.action_space.shape[0]
        frame_idx = 0
        rewards = []
        history_data = []
        params_list = []
        current_explore_noise_scale = explore_noise_scale
        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            epi_traj = []
            state = env.reset()
            # attach the dynamics parameters as states
            params = query_params(
                env,
                randomised_only=RANDOMISZED_ONLY,
                dynamics_only=DYNAMICS_ONLY
            )  # in reality, the true params cannot be queried
            params_state = np.concatenate((params, state))
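            # The universal policy is trained on [true params, state]; at deployment the OSI
            # predictions replace the true parameters (see the evaluation scripts above).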
            current_explore_noise_scale = current_explore_noise_scale * noise_decay

            for step in range(max_steps):
                # using internal state or not
                if CAT_INTERNAL:
                    internal_state = env.get_internal_state()
                    full_state = np.concatenate([state, internal_state])
                else:
                    full_state = state
                # exploration steps
                if frame_idx > explore_steps:
                    action = td3_trainer.policy_net.get_action(
                        params_state, noise_scale=current_explore_noise_scale)
                else:
                    action = td3_trainer.policy_net.sample_action()

                try:
                    next_state, reward, done, info = env.step(action)
                    if environment_params[
                            "has_renderer"] and environment_params[
                                "render_visual_mesh"]:
                        env.render()
                except KeyboardInterrupt:
                    print('Finished')
                    if osi_model is None:
                        td3_trainer.save_model(model_path)
                except MujocoException:
                    print('MujocoException')
                    break

                if info["unstable"]:  # capture the case with cube flying away for pushing task
                    break

                if CAT_INTERNAL:
                    target_joint_action = info["joint_velocities"]
                    full_action = np.concatenate([action, target_joint_action])
                else:
                    full_action = action
                epi_traj.append(np.concatenate(
                    (full_state,
                     full_action)))  # internal state for osi only, not for up

                # predict the dynamics parameters from the (s, a) history once OSI
                # has been pre-trained; otherwise fall back to the true parameters
                if osi_model is not None and eps > osi_pretrain_eps:
                    if len(epi_traj) >= osi_l:
                        osi_input = stack_data(
                            epi_traj, osi_l
                        )  # stack (s,a) to have same length as in the model input
                        pre_params = osi_model(osi_input).detach().numpy()
                    else:
                        zero_osi_input = np.zeros(osi_input_dim)
                        pre_params = osi_model(zero_osi_input).detach().numpy()
                else:
                    pre_params = params

                next_params_state = np.concatenate((pre_params, next_state))
                replay_buffer.push(params_state, action, reward,
                                   next_params_state, done)
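                # transitions are stored with parameter-augmented states so the TD3 actor and
                # critic are conditioned on the dynamics parameters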

                state = next_state
                params_state = next_params_state
                episode_reward += reward
                frame_idx += 1

                if replay_buffer.get_length(
                ) > batch_size and osi_model is None:
                    for i in range(update_itr):
                        _ = td3_trainer.update(
                            batch_size,
                            eval_noise_scale=eval_noise_scale,
                            reward_scale=reward_scale)

                if done:
                    break

            if osi_model is not None:  # train osi
                history_data.append(np.array(epi_traj))
                params_list.append(params)

                if eps % osi_batch_size == 0 and eps > 0:
                    label, data = generate_data(params_list,
                                                history_data,
                                                length=osi_l)
                    osi_model, loss = OnlineSIupdate(osi_model,
                                                     data,
                                                     label,
                                                     epoch=osi_itr)
                    osi_loss_queue.put(loss)
                    # clear the dataset; alternative: using a offline buffer
                    params_list = []
                    history_data = []
                    torch.save(osi_model.state_dict(), model_path + '_osi')
                    print('OSI Episode: {} | Epoch Loss: {}'.format(eps, loss))
            else:  # train up
                print('Worker: ', id, '|Episode: ', eps, '| Episode Reward: ',
                      episode_reward)
                rewards_queue.put(episode_reward)
                success_queue.put(info['success'])

                if eps % eval_interval == 0 and eps > 0:
                    # plot(rewards, id)
                    td3_trainer.save_model(model_path)
                    eval_r, eval_succ = evaluate(
                        env,
                        td3_trainer.policy_net,
                        up=True,
                        randomised_only=RANDOMISZED_ONLY,
                        dynamics_only=DYNAMICS_ONLY)
                    eval_rewards_queue.put(eval_r)
                    eval_success_queue.put(eval_succ)

        if osi_model is not None:  # train osi
            torch.save(osi_model.state_dict(), model_path + '_osi')
        else:  # train up
            td3_trainer.save_model(model_path)
    # the replay buffer is a class, have to use torch manager to make it a proxy for sharing across processes
    BaseManager.register('ReplayBuffer', ReplayBuffer)
    manager = BaseManager()
    manager.start()
    replay_buffer = manager.ReplayBuffer(
        replay_buffer_size)  # share the replay buffer through manager

    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(
        env_name)
    prefix = env_name + str(len(environment_params["parameters_to_randomise"])
                            )  # number of randomised parameters
    model_path = '../../../../../data/uposi_td3/model/' + prefix + '_uposi_td3'

    params = query_params(env,
                          randomised_only=RANDOMISZED_ONLY,
                          dynamics_only=DYNAMICS_ONLY)
    params_dim = params.shape[0]  # dimension of parameters for prediction
    print('Dimension of parameters for prediction: {}'.format(params_dim))
    action_space = env.action_space
    ini_state_space = env.observation_space
    state_space = spaces.Box(
        -np.inf, np.inf, shape=(ini_state_space.shape[0] +
                                params_dim, ))  # add the dynamics param dim
    if CAT_INTERNAL:
        internal_state_dim = env.get_internal_state_dimension()
        print('Dimension of internal state: ', internal_state_dim)
        _, _, _, info = env.step(np.zeros(action_space.shape[0]))
        internal_action_dim = np.array(info["joint_velocities"]).shape[0]
        print('Dimension of internal action: ', internal_action_dim)
        osi_input_dim = osi_l * (ini_state_space.shape[0] + action_space.shape[0] +
                                 internal_state_dim + internal_action_dim)
def evaluate(env,
             policy,
             up=False,
             eval_episodes=5,
             Projection=False,
             proj_net=None,
             num_envs=5):
    '''
    Evaluate the policy during training with fixed dynamics.
    :param eval_episodes: number of evaluation episodes per test environment
    :param up: if True, the universal policy is conditioned on dynamics parameters
    '''
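    # Minimal example call (assuming env and td3_trainer as set up in worker() above):
    #   eval_r, eval_succ = evaluate(env, td3_trainer.policy_net, up=True)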
    number_of_episodes = eval_episodes
    return_per_test_environment = []
    success_per_test_environment = []
    # initial_dynamics_params = env.get_dynamics_parameters()
    dynamics_params = testing_envionments_generator_random(
        env, num_envs)  # before randomisation off
    ### SUGGESTED CHANGE ####
    initial_dynamics_params, _ = env.randomisation_off()
    ###################
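    # Evaluation runs with randomisation switched off over num_envs fixed parameter sets
    # sampled above, so returns stay comparable across checkpoints.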

    # do visualization
    # print('Starting testing')
    for params in dynamics_params:
        mean_return = 0.0
        success = 0

        # print('with parameters {} ...'.format(params))
        for i in range(number_of_episodes):
            #### SUGGESTED CHANGE ####
            # First set the parameters and then reset for them to take effect.
            env.set_dynamics_parameters(params)
            obs = env.reset()
            p = env.get_dynamics_parameters()
            #######
            # obs = env.reset()
            # env.set_dynamics_parameters(params)  # set parameters after reset

            if up:
                params_list = query_params(env, randomised_only=True)
                if Projection and proj_net is not None:
                    params_list = proj_net.get_context(params_list)
                obs = np.concatenate((params_list, obs))

            episode_return = 0.0
            done = False
            while not done:
                action = policy.get_action(obs)
                obs, reward, done, info = env.step(action)
                episode_return += reward

                if up:
                    obs = np.concatenate((params_list, obs))

                if (done):
                    mean_return += episode_return
                    success += int(info['success'])
                    break

        mean_return /= number_of_episodes
        success_rate = float(success) / float(number_of_episodes)
        # print('the mean return is {}'.format(mean_return))
        return_per_test_environment.append(mean_return)
        success_per_test_environment.append(success_rate)

    #### SUGGESTED CHANGE ####
    # Only randomisation_on needs to be called; it restores the previous parameters and
    # re-enables randomisation. Note that this only takes effect on the next reset, when
    # the parameters are randomised again.
    env.randomisation_on()
    #####

    # env.set_dynamics_parameters(initial_dynamics_params)  # for non-randomisation cases, need to set original dynamics parameters
    print('total_average_return_over_test_environments : {}'.format(
        np.mean(return_per_test_environment)))
    return np.mean(return_per_test_environment), np.mean(
        success_per_test_environment)
def main():
    render = True

    # Parameters
    horizon = 50
    total_repeats = 50

    state_dim = 6
    action_dim = 3
    env_name = 'SawyerReach'

    ### Prepare Environment #####
    env = make(
        'SawyerReach',
        gripper_type="PushingGripper",
        parameters_to_randomise=[],
        randomise_initial_conditions=False,
        table_full_size=(0.8, 1.6, 0.719),
        use_camera_obs=False,
        use_object_obs=True,
        reward_shaping=True,
        use_indicator_object=False,
        has_renderer=render,
        has_offscreen_renderer=False,
        render_collision_mesh=False,
        render_visual_mesh=True,
        control_freq=10,
        horizon=100,
        ignore_done=False,
        camera_name="frontview",
        camera_height=256,
        camera_width=256,
        camera_depth=False,
        pid=True,
        success_radius=0.01
    )

    env = FlattenWrapper(GymWrapper(EEFXVelocityControl(env, dof=3, max_action=0.1)),
                         keys='task-state', add_info=True)
    env._name = env_name
    env.reset()


    for policy_idx in range(1):
        ##### SETTING UP THE POLICY #########
        method = ['Single', 'LSTM', 'EPI', 'UPOSI'][policy_idx]
        if(method == 'Single'):
            alg_idx = 1
        elif(method == 'LSTM'):
            alg_idx = 2
        elif(method == 'UPOSI'):
            alg_idx = 3
            osi_l = 5
            RANDOMISZED_ONLY = True
            DYNAMICS_ONLY = True
            CAT_INTERNAL = True
            params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
            params_dim = params.shape[0] # dimension of parameters for prediction
            if CAT_INTERNAL:
                internal_state_dim = env.get_internal_state_dimension()
                _, _, _, info = env.step(np.zeros(action_dim))
                internal_action_dim = np.array(info["joint_velocities"]).shape[0]
                osi_input_dim = osi_l*(state_dim+action_dim+internal_state_dim+internal_action_dim)
            state_dim+=params_dim
        elif(method == 'EPI'):
            alg_idx = 4
            embed_dim = 10
            traj_l = 10
            NO_RESET = True
            embed_input_dim = traj_l*(state_dim+action_dim)
            ori_state_dim = state_dim
            state_dim += embed_dim
        else:
            continue

        alg_name = ['sac', 'td3', 'lstm_td3', 'uposi_td3', 'epi_td3'][alg_idx]
        if method == 'EPI' or method == 'UPOSI':
            randomisation_idx_range = 1
        else:
            randomisation_idx_range = 4
        for randomisation_idx in range(1, 2):  # only the no-randomisation case; use range(randomisation_idx_range) to sweep all regimes
            for goal_idx, goal_pos in enumerate(REACH_GOALS):
                goal_idx = goal_idx+1
                ###### SETTING UP THE ENVIRONMENT ######
                randomisation_params = [reach_full_randomisation, [], reach_force_randomisation, reach_force_noise_randomisation]
                env.change_parameters_to_randomise(randomisation_params[randomisation_idx])
                env._set_goal_neutral_offset(*goal_pos)
                env.reset()

                if (render):
                    env.viewer.set_camera(camera_id=0)
                    env.render()

                base_pos_in_world = env.sim.data.get_body_xpos("base")
                base_rot_in_world = env.sim.data.get_body_xmat("base").reshape((3, 3))
                base_pose_in_world = T.make_pose(base_pos_in_world, base_rot_in_world)
                world_pose_in_base = T.pose_inv(base_pose_in_world)
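                # world_pose_in_base converts world-frame poses into the robot base frame; it is
                # used by grab_data below to log eef/goal positions and velocities in base coordinates.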

                # choose which randomisation is applied
                randomisation_type = ['reach_full-randomisation', 'reach_no-randomisation', \
                                      'reach_force-randomisation', 'reach_force-&-noise-randomisation'][randomisation_idx]
                number_random_params = [14, 0, 1, 3][randomisation_idx]
                folder_path = '../../../../sawyer/src/sim2real_dynamics_sawyer/assets/rl/'+method +'/' + alg_name + '/model/'
                path = folder_path + env_name + str(
                    number_random_params) + '_' + alg_name
                try:
                    policy = load(path=path, alg=alg_name, state_dim=state_dim,
                                    action_dim=action_dim)
                    if method == 'UPOSI':
                        osi_model = load_model(model_name='osi', path=path, input_dim = osi_input_dim, output_dim=params_dim )
                    elif method == 'EPI':
                        embed_model = load_model(model_name='embedding', path=path, input_dim = embed_input_dim, output_dim = embed_dim )
                        embed_model.cuda()
                        epi_policy_path = folder_path + env_name + str(number_random_params) + '_' + 'epi_ppo_epi_policy'
                        epi_policy = load(path=epi_policy_path, alg='ppo', state_dim=ori_state_dim, action_dim=action_dim )

                except Exception:
                    print('Could not load model for', method, randomisation_type)
                    continue

                if (alg_name == 'lstm_td3'):
                    hidden_out = (torch.zeros([1, 1, 512], dtype=torch.float).cuda(), \
                                  torch.zeros([1, 1, 512],
                                              dtype=torch.float).cuda())  # initialize hidden state for lstm, (hidden, cell), each is (layer, batch, dim)
                    last_action = np.array([0, 0, 0])
                ######################################

                log_save_name = '{}_{}_{}_{}'.format(method, alg_name, randomisation_type, number_random_params)

                for repeat in range(total_repeats):
                    #Reset environment
                    obs = env.reset()
                    if (render):
                        env.viewer.set_camera(camera_id=0)
                        env.render()

                    #Establish extra frame transforms
                    base_rot_in_eef = env.init_right_hand_orn.T

                    #Setup logger
                    log_path = '../../../../data/reaching/sim/new_runs/{}/goal_{}/trajectory_log_{}.csv'.format(log_save_name,goal_idx,repeat)

                    log_list = ["step", "time",
                                "cmd_eef_vx", "cmd_eef_vy", "cmd_eef_vz",
                                "eef_x", "eef_y", "eef_z",
                                "eef_vx", "eef_vy", "eef_vz",
                                "goal_x", "goal_y", "goal_z",
                                "obs_0", "obs_1", "obs_2",
                                "obs_3", "obs_4", "obs_5"
                                ]

                    logger = Logger(log_list, log_path, verbatim=render)

                    i = 0
                    mujoco_start_time = env.sim.data.time

                    if (alg_name == 'uposi_td3'):
                        uposi_traj = []
                        # params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
                        zero_osi_input = np.zeros(osi_input_dim)
                        pre_params = osi_model(zero_osi_input).detach().numpy()
                        params_state = np.concatenate((pre_params, obs))
                    elif (alg_name == 'epi_td3'):
                        if NO_RESET:

                            i = traj_l - 1

                            traj, [last_obs, last_state] = EPIpolicy_rollout(env, epi_policy, obs,
                                                                             mujoco_start_time=mujoco_start_time,
                                                                             logger=logger, data_grabber=grab_data,
                                                                             max_steps=traj_l,
                                                                             params=None)  # only one traj; pass in params to ensure it's not changed
                            state_action_in_traj = np.array(traj)[:, :-1]  # remove the rewards
                            embedding = embed_model(state_action_in_traj.reshape(-1))
                            embedding = embedding.detach().cpu().numpy()


                            obs = last_obs  # last observation
                            env.set_state(last_state)  # last underlying state
                        else:

                            traj, [last_obs, last_state] = EPIpolicy_rollout(env, epi_policy, obs,
                                                                             mujoco_start_time=mujoco_start_time,
                                                                             logger=None, data_grabber=None,
                                                                             max_steps=traj_l,
                                                                             params=None)  # only one traj; pass in params to ensure it's not changed
                            state_action_in_traj = np.array(traj)[:, :-1]  # remove the rewards
                            embedding = embed_model(state_action_in_traj.reshape(-1))
                            embedding = embedding.detach().cpu().numpy()




                            params = env.get_dynamics_parameters()
                            env.randomisation_off()
                            env.set_dynamics_parameters(params) # same as the rollout env
                            obs = env.reset()  # reset to make the params take effect
                            env.randomisation_on()

                        obs = np.concatenate((obs, embedding))


                    while (True):

                        mujoco_elapsed = env.sim.data.time - mujoco_start_time

                        #### CHOOSING THE ACTION #####
                        if (alg_name == 'lstm_td3'):
                            hidden_in = hidden_out
                            action, hidden_out = policy.get_action(obs, last_action, hidden_in, noise_scale=0.0)
                            last_action = action
                        elif (alg_name == 'uposi_td3'):
                            # using internal state or not
                            if CAT_INTERNAL:
                                internal_state = env.get_internal_state()
                                full_state = np.concatenate([obs, internal_state])
                            else:
                                full_state = obs
                            action = policy.get_action(params_state, noise_scale=0.0)
                        else:
                            action = policy.get_action(obs, noise_scale=0.0)
                        ##############################

                        next_obs, reward, done, info = env.step(action)
                        if (alg_name == 'uposi_td3'):
                            if CAT_INTERNAL:
                                target_joint_action = info["joint_velocities"]
                                full_action = np.concatenate([action, target_joint_action])
                            else:
                                full_action = action

                            uposi_traj.append(np.concatenate((full_state, full_action)))

                            if len(uposi_traj)>=osi_l:
                                osi_input = stack_data(uposi_traj, osi_l)
                                pre_params = osi_model(osi_input).detach().numpy()
                            else:
                                zero_osi_input = np.zeros(osi_input_dim)
                                pre_params = osi_model(zero_osi_input).detach().numpy()

                            next_params_state = np.concatenate((pre_params, next_obs))
                            params_state = next_params_state
                        elif (alg_name == 'epi_td3'):
                            next_obs=np.concatenate((next_obs, embedding))


                        #Grab logging data
                        eef_pos_in_base, eef_vel_in_base, goal_pos_in_base = grab_data(info, world_pose_in_base)
                        action_in_base = base_rot_in_eef.dot(action)

                        logger.log(i, mujoco_elapsed,
                                   action_in_base[0], action_in_base[1], action_in_base[2],
                                   eef_pos_in_base[0], eef_pos_in_base[1], eef_pos_in_base[2],
                                   eef_vel_in_base[0], eef_vel_in_base[1], eef_vel_in_base[2],
                                   goal_pos_in_base[0], goal_pos_in_base[1], goal_pos_in_base[2],
                                   obs[0], obs[1], obs[2],
                                   obs[3], obs[4], obs[5],
                                   )

                        obs = next_obs

                        if(render):
                            env.render()

                        i += 1

                        if (i >= horizon):
                            break


    env.close()
def main():
    render = False

    # Parameters
    total_repeats = 50


    for env_name in ['SawyerSlide', 'SawyerReach', 'SawyerPush']:
        for use_white_noise in [False]:  # set to [True] to run the white-noise ablation instead
            if(not use_white_noise):
                parameter_predictions = []
                oracle_parameters = []

            if(env_name == 'SawyerReach'):
                state_dim = 6
                action_dim = 3
                horizon = 50
                task = 'reaching'
            elif(env_name == 'SawyerPush'):
                horizon = 80
                state_dim = 10
                action_dim = 2
                task = 'pushing'
            elif(env_name == 'SawyerSlide'):
                horizon = 60
                state_dim = 12
                action_dim = 2
                task = 'sliding'

            env = make_env(task,render)
            env.reset()

            osi_l = 5

            method = 'UPOSI'

            RANDOMISZED_ONLY = True
            DYNAMICS_ONLY = True
            CAT_INTERNAL = True
            if (task == 'sliding'):
                CAT_INTERNAL = False

            params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
            params_dim = params.shape[0]  # dimension of parameters for prediction
            if CAT_INTERNAL:
                internal_state_dim = env.get_internal_state_dimension()
                _, _, _, info = env.step(np.zeros(action_dim))
                internal_action_dim = np.array(info["joint_velocities"]).shape[0]
                osi_input_dim = osi_l * (state_dim + action_dim + internal_state_dim + internal_action_dim)
            else:
                osi_input_dim = osi_l * (state_dim + action_dim)

            state_dim += params_dim

            alg_name = 'uposi_td3'

            if(task == 'reaching'):
                GOALS = REACH_GOALS
            elif(task == 'pushing'):
                GOALS = PUSH_GOALS
            elif(task == 'sliding'):
                GOALS = SLIDE_GOALS

            for goal_idx, goal_pos in enumerate(GOALS):
                goal_idx = goal_idx+1
                ###### SETTING UP THE ENVIRONMENT ######
                if(task == 'reaching'):
                    env._set_goal_neutral_offset(*goal_pos)
                elif(task == 'pushing'):
                    start_pos = np.array([0., -16.75e-2])
                    env._set_gripper_neutral_offset(*start_pos)
                    env._set_goal_neutral_offset(*goal_pos)

                env.reset()

                if (render):
                    env.viewer.set_camera(camera_id=0)
                    env.render()

                base_pos_in_world = env.sim.data.get_body_xpos("base")
                base_rot_in_world = env.sim.data.get_body_xmat("base").reshape((3, 3))
                base_pose_in_world = T.make_pose(base_pos_in_world, base_rot_in_world)
                world_pose_in_base = T.pose_inv(base_pose_in_world)

                # choose which randomisation is applied (this label is reused in the log paths for all tasks)
                randomisation_type = 'reach_full-randomisation'
                if (task == 'reaching'):
                    number_random_params = 14
                elif (task == 'pushing'):
                    number_random_params = 23
                elif (task == 'sliding'):
                    number_random_params = 22

                path = '../../../../sawyer/src/sim2real_dynamics_sawyer/assets/rl/'+method +'/' + alg_name + '/model/' + env_name + str(
                    number_random_params) + '_' + alg_name
                try:
                    policy = load(path=path, alg=alg_name, state_dim=state_dim,
                                  action_dim=action_dim)

                    if(not use_white_noise):
                        osi_model = load_model(model_name='osi', path=path, input_dim=osi_input_dim,
                                               output_dim=params_dim)
                except Exception:
                    print('Could not load model for', method, randomisation_type)
                    continue

                log_save_name = '{}_{}_{}_{}'.format(method, alg_name, randomisation_type, number_random_params)

                for repeat in range(total_repeats):
                    #Reset environment
                    obs = env.reset()

                    #Establish extra frame transforms
                    if(task != 'sliding'):
                        base_rot_in_eef = env.init_right_hand_orn.T

                    #Setup logger
                    if(use_white_noise):
                        noise_folder = 'noise'
                    else:
                        noise_folder = 'normal'

                    log_path = '../../../../data/uposi_ablation/{}/{}/{}/goal_{}/trajectory_log_{}.csv'.format(task, noise_folder,  log_save_name,goal_idx,repeat)

                    if (task == 'reaching'):
                        log_list = ["step", "time",
                                    "cmd_eef_vx", "cmd_eef_vy", "cmd_eef_vz",
                                    "eef_x", "eef_y", "eef_z",
                                    "eef_vx", "eef_vy", "eef_vz",
                                    "goal_x", "goal_y", "goal_z",
                                    "obs_0", "obs_1", "obs_2",
                                    "obs_3", "obs_4", "obs_5"
                                    ]

                    elif (task == 'pushing'):
                        log_list = ["step", "time",
                                    "cmd_eef_vx", "cmd_eef_vy",
                                    "goal_x", "goal_y", "goal_z",
                                    "eef_x", "eef_y", "eef_z",
                                    "eef_vx", "eef_vy", "eef_vz",
                                    "object_x", "object_y", "object_z",
                                    "object_vx", "object_vy", "object_vz",
                                    "z_angle",
                                    "obs_0", "obs_1", "obs_2",
                                    "obs_3", "obs_4", "obs_5",
                                    "obs_6", "obs_7", "obs_8", "obs_9"]
                    elif (task == 'sliding'):
                        log_list = ["step", "time",
                                    "cmd_j5", "cmd_j6",
                                    "obj_x", "obj_y", "obj_z",
                                    "sin_z", "cos_z",
                                    "obj_vx", "obj_vy", "obj_vz",
                                    "a_j5", "a_j6",
                                    "v_j5", "v_j6",
                                    ]

                    logger = Logger(log_list, log_path, verbatim=render)

                    i = 0
                    mujoco_start_time = env.sim.data.time

                    if(use_white_noise):
                        params = np.random.uniform(-1.,1.,size =(params_dim,))
                        params_state = np.concatenate((params, obs))
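                        # white-noise ablation: the policy receives random "parameters" drawn
                        # uniformly from [-1, 1] instead of OSI predictions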
                    else:
                        epi_traj = []
                        params = query_params(env, randomised_only=RANDOMISZED_ONLY, dynamics_only=DYNAMICS_ONLY)
                        zero_osi_input = np.zeros(osi_input_dim)
                        pre_params = osi_model(zero_osi_input).detach().numpy()

                        oracle_parameters.append(params)
                        parameter_predictions.append(pre_params)

                        params_state = np.concatenate((pre_params, obs))


                    while (True):

                        mujoco_elapsed = env.sim.data.time - mujoco_start_time

                        #### CHOOSING THE ACTION #####

                        if CAT_INTERNAL:
                            internal_state = env.get_internal_state()
                            full_state = np.concatenate([obs, internal_state])
                        else:
                            full_state = obs

                        action = policy.get_action(params_state, noise_scale=0.0)

                        ##############################

                        try:
                            next_obs, reward, done, info = env.step(action)
                        except MujocoException:
                            print('Mujoco exception')
                            break  # abandon this rollout on simulator instability

                        if (use_white_noise):
                            params = np.random.uniform(-1., 1., size = (params_dim,))
                            next_params_state = np.concatenate((params, next_obs))
                            params_state = next_params_state
                        else:
                            if CAT_INTERNAL:
                                target_joint_action = info["joint_velocities"]
                                full_action = np.concatenate([action, target_joint_action])
                            else:
                                full_action = action
                            epi_traj.append(np.concatenate((full_state, full_action)))

                            if len(epi_traj)>=osi_l:
                                osi_input = stack_data(epi_traj, osi_l)
                                pre_params = osi_model(osi_input).detach().numpy()
                            else:
                                zero_osi_input = np.zeros(osi_input_dim)
                                pre_params = osi_model(zero_osi_input).detach().numpy()

                            oracle_parameters.append(params)
                            parameter_predictions.append(pre_params)

                            next_params_state = np.concatenate((pre_params, next_obs))
                            params_state = next_params_state



                        if(task == 'reaching'):
                            # Grab logging data
                            eef_pos_in_base, eef_vel_in_base, goal_pos_in_base = grab_reach_data(info, world_pose_in_base)
                            action_in_base = base_rot_in_eef.dot(action)
                            logger.log(i, mujoco_elapsed,
                                       action_in_base[0], action_in_base[1], action_in_base[2],
                                       eef_pos_in_base[0], eef_pos_in_base[1], eef_pos_in_base[2],
                                       eef_vel_in_base[0], eef_vel_in_base[1], eef_vel_in_base[2],
                                       goal_pos_in_base[0], goal_pos_in_base[1], goal_pos_in_base[2],
                                       obs[0], obs[1], obs[2],
                                       obs[3], obs[4], obs[5],
                                       )
                        elif (task == 'pushing'):
                            goal_pos_in_base, eef_pos_in_base, eef_vel_in_base, \
                            object_pos_in_base, object_vel_in_base, z_angle, = grab_push_data(info, world_pose_in_base)

                            action_3d = np.concatenate([action, [0.0]])
                            action_3d_in_base = base_rot_in_eef.dot(action_3d)

                            logger.log(i, mujoco_elapsed,
                                       action_3d_in_base[0], action_3d_in_base[1],
                                       goal_pos_in_base[0], goal_pos_in_base[1], goal_pos_in_base[2],
                                       eef_pos_in_base[0], eef_pos_in_base[1], eef_pos_in_base[2],
                                       eef_vel_in_base[0], eef_vel_in_base[1], eef_vel_in_base[2],
                                       object_pos_in_base[0], object_pos_in_base[1], object_pos_in_base[2],
                                       object_vel_in_base[0], object_vel_in_base[1], object_vel_in_base[2],
                                       z_angle[0],
                                       obs[0], obs[1], obs[2],
                                       obs[3], obs[4], obs[5],
                                       obs[6], obs[7], obs[8],
                                       obs[9],
                                       )
                        elif (task == 'sliding'):
                            logger.log(i, mujoco_elapsed,
                                       action[0], action[1],
                                       obs[0], obs[1], obs[2],
                                       obs[3], obs[4], obs[5], obs[6],
                                       obs[7], obs[8], obs[9],
                                       obs[10], obs[11],
                                       )

                        obs = next_obs

                        if(render):
                            env.render()

                        i += 1
                        if (i >= horizon):
                            break


            if(not use_white_noise):
                parameter_predictions = np.array(parameter_predictions)
                oracle_parameters = np.array(oracle_parameters)
                percent_diff = np.abs((parameter_predictions-oracle_parameters)/2.)*100
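                # percentage error between OSI predictions and oracle parameters; dividing by 2
                # assumes the parameters are normalised to [-1, 1] (a range of 2), as in the
                # white-noise ablation above.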

                average_percent_diff = np.nanmean(percent_diff[np.isfinite(percent_diff)])
                print('{} percent diff :{}'.format(task,average_percent_diff))
                save_path = '../../../../data/uposi_ablation/{}/{}/'.format(task,noise_folder)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                file = open(os.path.join(save_path, 'percent_diff.txt'), 'a')
                file.write('{} percent diff :{}'.format(task,average_percent_diff))
                file.close()

    env.close()