Example #1
def online_history_collection(OSImodel, env_name='SawyerReach', proj_net=None, policy=None, length=3, \
    itr=30, max_steps=30, params_dim = 37, hidden_dim=512, PRED_PARAM=False, SIspace='end', selected_joints=[0]):
    """ collect random simulation parameters and trajetories with universal policy 
    https://arxiv.org/abs/1702.02453 (Preparing for the Unknown: Learning a Universal Policy with Online System Identification)
    """
    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(
        env_name)
    action_space = env.action_space
    ini_state_space = env.observation_space
    state_space = spaces.Box(
        -np.inf, np.inf, shape=(ini_state_space.shape[0] +
                                params_dim, ))  # add the dynamics param dim

    if policy is None:  # load an off-line trained policy if none is given
        policy = TD3_PolicyNetwork(state_space, action_space,
                                   hidden_dim).cuda()

    params_list = []
    history = []
    for eps in range(itr):  # K
        state = env.reset()
        # params = query_params(env)
        params = query_key_params(env)
        epi_traj = []
        params_list.append(params)

        # N is 1 in this implementation, as each env.reset() samples a different parameter set

        for step in range(max_steps):  # T
            if len(epi_traj) >= length and PRED_PARAM:
                osi_input = stack_data(
                    epi_traj, length
                )  # stack (s,a) to have same length as in the model input
                pre_params = OSImodel(osi_input).detach().numpy()
            else:
                pre_params = params

            if proj_net is not None:  # projected to low dimensions
                pre_params = proj_net.get_context(pre_params)
            params_state = np.concatenate(
                (pre_params, state)
            )  # use predicted parameters instead of true values for training, according to the paper
            action = policy.get_action(params_state)
            next_state, _, _, info = env.step(action)
            if SIspace == 'end':
                epi_traj.append(np.concatenate((state, action)))
            elif SIspace == 'joint':
                epi_traj.append(
                    np.concatenate(
                        (env._joint_positions[selected_joints],
                         info['joint_velocities'][selected_joints])))

            state = next_state
        history.append(np.array(epi_traj))
    print("Finished collecting data of {} trajectories.".format(itr))
    return params_list, history
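The stack_data helper used above is not shown in this listing; a minimal sketch of what such a helper could look like, assuming each entry of epi_traj is a fixed-width 1-D NumPy array of concatenated state and action:

import numpy as np

def stack_data(traj, length):
    """Flatten the most recent `length` (state, action) vectors into one OSI input vector."""
    return np.concatenate(traj[-length:], axis=0)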
Example #2
def test(env_name='SawyerReach',
         state_dim=6,
         action_dim=3,
         length=3,
         params_dim=4,
         path='./osi',
         SIspace='end',
         selected_joints=[0]):
    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(
        env_name)
    if SIspace == 'end':
        data_dim = length * (state_dim + action_dim)
        save_path = 'osi_end'
    elif SIspace == 'joint':
        joint_dim = len(selected_joints)
        data_dim = length * (
            joint_dim + joint_dim
        )  # for joint space, 'state' is joint position, 'action' is joint velocity
        save_path = 'osi_joint'
    osi_model = OSINetwork(input_dim=data_dim, output_dim=params_dim)

    osi_model.load_state_dict(torch.load(save_path))
    osi_model.eval()
    policy = RandomPolicy(action_dim=action_dim)

    for eps in range(10):
        state = env.reset()
        # params = query_params(env)
        params = query_key_params(env)
        epi_traj = []
        print('true params: ', params)

        for step in range(30):
            if len(epi_traj) >= length:
                osi_input = stack_data(
                    epi_traj, length
                )  # stack (s,a) to have same length as in the model input
                pre_params = osi_model(osi_input).detach().numpy()
                print('predicted params: ', pre_params)

            action = policy.get_action(state)
            next_state, _, _, info = env.step(action)
            if SIspace == 'end':
                epi_traj.append(np.concatenate((state, action)))
            elif SIspace == 'joint':
                epi_traj.append(
                    np.concatenate(
                        (env._joint_positions[selected_joints],
                         info['joint_velocities'][selected_joints])))

            state = next_state
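RandomPolicy is likewise external to this listing; an illustrative stand-in that only needs to produce actions of the right dimension could be as simple as:

import numpy as np

class RandomPolicy:
    """Illustrative stand-in: ignores the state and samples uniform actions in [-1, 1]."""
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def get_action(self, state):
        # the state argument is ignored; a uniform random action is returned
        return np.random.uniform(-1.0, 1.0, size=self.action_dim)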
Example #3
def offline_history_collection(env_name,
                               itr=30,
                               max_steps=30,
                               policy=None,
                               params_dim=37,
                               SIspace='end',
                               selected_joints=[0]):
    """ collect random simulation parameters and trajetories with given policy """
    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(
        env_name)
    action_space = env.action_space
    state_space = env.observation_space
    if policy is None:  # load an off-line trained policy if none is given
        policy = DPG_PolicyNetwork(state_space, action_space, 512).cuda()

    params_list = []
    history = []
    for epi in range(itr):
        state = env.reset()
        # params = query_params(env)
        params = query_key_params(env)
        epi_traj = []
        params_list.append(params)
        for step in range(max_steps):
            action = policy.get_action(state)
            next_state, _, _, info = env.step(action)
            if SIspace == 'end':
                epi_traj.append(np.concatenate((state, action)))
            elif SIspace == 'joint':
                epi_traj.append(
                    np.concatenate(
                        (env._joint_positions[selected_joints],
                         info['joint_velocities'][selected_joints])))

            state = next_state
        history.append(np.array(epi_traj))
    print("Finished collecting data.")
    return params_list, history
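The params_list and history returned here are typically consumed by a supervised regression step that fits the OSI network to map stacked (state, action) windows to the true dynamics parameters. A minimal sketch of such a training loop, assuming an OSINetwork-style torch module that accepts a batch of flattened windows (train_osi is a hypothetical helper, not part of the original code):

import numpy as np
import torch
import torch.nn.functional as F

def train_osi(osi_model, params_list, history, length=3, epochs=100, lr=1e-3):
    """Fit the OSI network to predict dynamics parameters from (s, a) windows."""
    inputs, targets = [], []
    for params, traj in zip(params_list, history):
        # slide a window of `length` steps over each collected trajectory
        for t in range(length, len(traj) + 1):
            inputs.append(traj[t - length:t].reshape(-1))
            targets.append(params)
    x = torch.FloatTensor(np.array(inputs))
    y = torch.FloatTensor(np.array(targets))
    optimizer = torch.optim.Adam(osi_model.parameters(), lr=lr)
    for _ in range(epochs):
        loss = F.mse_loss(osi_model(x), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return osi_model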
def main():
    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(
        ENV_NAME)

    state_space = env.observation_space
    action_space = env.action_space

    ppo = PPO(state_space, action_space, hidden_dim=512)

    if args.train:
        ppo.share_memory()
        rewards_queue = mp.Queue(
        )  # used to collect episode rewards from all processes and plot the curve
        eval_rewards_queue = mp.Queue(
        )  # used to collect offline-evaluated rewards from all processes and plot the curve
        success_queue = mp.Queue(
        )  # used to collect success events from all processes
        eval_success_queue = mp.Queue()
        processes = []
        rewards = []
        success = []
        eval_rewards = []
        eval_success = []

        for i in range(NUM_WORKERS):
            process = Process(target=worker, args=(i, ppo, environment_params, environment_wrappers,environment_wrapper_arguments,\
                rewards_queue, eval_rewards_queue, success_queue, eval_success_queue,))  # the args contain both shared and non-shared objects
            process.daemon = True  # all processes closed when the main stops
            processes.append(process)

        [p.start() for p in processes]
        while True:  # keep getting the episode reward from the queue
            # r = rewards_queue.get()
            # succ = success_queue.get()
            eval_r = eval_rewards_queue.get(
            )  # this queue has a different sampling frequency from the two queues above; calling .get() on all of them at once would stall the loop
            eval_succ = eval_success_queue.get()

            # success.append(succ)
            # rewards.append(r)
            eval_rewards.append(eval_r)
            eval_success.append(eval_succ)

            if len(eval_rewards) % 20 == 0 and len(eval_rewards) > 0:
                # plot(rewards)
                # np.save(prefix+'td3_rewards', rewards)
                # np.save(prefix+'td3_success', success)
                np.save(prefix + 'eval_rewards', eval_rewards)
                np.save(prefix + 'eval_success', eval_success)

        [p.join() for p in processes]  # finished at the same time

        ppo.save_model(MODEL_PATH)

    if args.test:
        ppo.load_model(MODEL_PATH)
        ppo.to_cuda()
        while True:
            s = env.reset()
            for i in range(EP_LEN):
                env.render()
                a = ppo.choose_action(s, True)
                s, r, done, _ = env.step(a)
                if done:
                    break
    eval_interval = 100
    # load other default parameters
    [action_range, batch_size, explore_steps, update_itr, explore_noise_scale, eval_noise_scale, reward_scale, \
        hidden_dim, noise_decay, policy_target_update_interval, q_lr, policy_lr, replay_buffer_size, DETERMINISTIC] = \
            load_params('td3', ['action_range', 'batch_size', 'explore_steps', 'update_itr', 'explore_noise_scale',\
             'eval_noise_scale', 'reward_scale', 'hidden_dim', 'noise_decay', \
                 'policy_target_update_interval', 'q_lr', 'policy_lr','replay_buffer_size', 'deterministic'] )

    # the replay buffer is a class; use a multiprocessing manager to make it a proxy for sharing across processes
    BaseManager.register('ReplayBuffer', ReplayBuffer)
    manager = BaseManager()
    manager.start()
    replay_buffer = manager.ReplayBuffer(
        replay_buffer_size)  # share the replay buffer through manager

    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(
        env_name)
    prefix = env_name + str(len(environment_params["parameters_to_randomise"])
                            )  # number of randomised parameters
    model_path = '../../../../../data/uposi_td3/model/' + prefix + '_uposi_td3'

    params = query_params(env,
                          randomised_only=RANDOMISZED_ONLY,
                          dynamics_only=DYNAMICS_ONLY)
    params_dim = params.shape[0]  # dimension of parameters for prediction
    print('Dimension of parameters for prediction: {}'.format(params_dim))
    action_space = env.action_space
    ini_state_space = env.observation_space
    state_space = spaces.Box(
        -np.inf, np.inf, shape=(ini_state_space.shape[0] +
                                params_dim, ))  # add the dynamics param dim
    def __init__(self, env_name='SawyerReach', length=3, context_dim=3, Projection=True, CAT_INTERNAL=False):
        self.cat_internal = CAT_INTERNAL
        env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(env_name)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        print('Env name: ', env_name)
        print('Dimension of env state: ', state_dim)
        print('Dimension of env action: ', action_dim)
        self.params_dim = env.get_randomised_parameter_dimensions()
        print('Dimension of randomised parameters: ', self.params_dim)
        if CAT_INTERNAL:
            internal_state_dim = env.get_internal_state_dimension()
            print('Dimension of internal state: ', internal_state_dim)
            data_dim = length*(state_dim+action_dim+internal_state_dim)
        else:
            data_dim = length*(state_dim+action_dim)
        self.osi_model = OSINetwork(input_dim=data_dim, output_dim=self.params_dim)
        self.env_name = env_name
        self.length = length  # trajectory length for prediction

        if Projection:
            self.proj_net = load_model(path = '../../../../data/pup_td3/model/pup_td3_projection', input_dim=self.params_dim, output_dim=context_dim)
            self.policy=load(path = '../../../../data/pup_td3/model/pup_td3', alg='TD3', state_dim = state_dim+context_dim, action_dim = action_dim)
            self.save_path = '../../../../../data/pup_td3/model/osi'
            
        else:
            self.proj_net = None
            self.policy=load(path = '../../../../data/up_td3/model/up_td3', alg='TD3', state_dim = state_dim+self.params_dim, action_dim = action_dim)
            self.save_path = '../../../../../data/up_td3/model/osi'
    def online_history_collection(self, itr=30, max_steps=30, PRED_PARAM=False, CAT_INTERNAL=False):
        """ collect random simulation parameters and trajetories with universal policy 
        https://arxiv.org/abs/1702.02453 (Preparing for the Unknown: Learning a Universal Policy with Online System Identification)
        """
        env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(self.env_name)
        action_space = env.action_space
        ini_state_space = env.observation_space
        state_space = spaces.Box(-np.inf, np.inf, shape=(ini_state_space.shape[0]+self.params_dim, ))  # add the dynamics param dim

        # a randomly initialised policy used for data collection
        data_collection_policy=DPG_PolicyNetwork(state_space, action_space, hidden_dim=512).cuda()

        params_list=[]
        history=[]
        for eps in range(itr):  # K
            state = env.reset()
            params = query_params(env, randomised_only=True)
            epi_traj = []
            params_list.append(params)

            # N is 1 in this implementation, as each env.reset() samples a different parameter set

            for step in range(max_steps):  # T
                if CAT_INTERNAL:
                    internal_state = env.get_internal_state()
                    full_state = np.concatenate([state, internal_state])
                else:
                    full_state = state
                if len(epi_traj)>=self.length and PRED_PARAM:
                    osi_input = stack_data(epi_traj, self.length)  # stack (s,a) to have same length as in the model input
                    pre_params = self.osi_model(osi_input).detach().numpy()
                else:
                    pre_params = params

                if self.proj_net is not None:  # projected to low dimensions
                    pre_params = self.proj_net.get_context(pre_params)
                else:
                    pass
                    # print('No projection network!')
                params_state = np.concatenate((pre_params, state))   # use predicted parameters instead of true values for training, according to the paper
                action = data_collection_policy.get_action(params_state)
                epi_traj.append(np.concatenate((full_state, action)))

                next_state, _, _, _ = env.step(action)
                state = next_state
            history.append(np.array(epi_traj))
        print("Finished collecting data of {} trajectories.".format(itr))
        return params_list, history
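Following the UP-OSI scheme of the linked paper, collection and fitting alternate: the first round conditions the policy on the true parameters (PRED_PARAM=False), and later rounds feed the OSI predictions back in. A hedged sketch of that outer loop, assuming the surrounding class is named OSI and reusing the hypothetical train_osi helper sketched earlier:

# the class name OSI and the train_osi helper are assumptions for illustration
osi = OSI(env_name='SawyerReach', length=3, context_dim=3, Projection=True)

# first round: condition the policy on the true (projected) parameters
params_list, history = osi.online_history_collection(itr=30, PRED_PARAM=False)
train_osi(osi.osi_model, params_list, history, length=osi.length)

# later rounds: condition the policy on the OSI predictions instead
for _ in range(5):
    params_list, history = osi.online_history_collection(itr=30, PRED_PARAM=True)
    train_osi(osi.osi_model, params_list, history, length=osi.length)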
Example #8
def offline_history_collection(env_name, itr=30, policy=None, \
    vectorize=True, discrete=False, vine=False, vine_sample_size=500, egreedy=0):
    """ 
    Collect random simulation parameters and trajetories with given policy.
    ----------------------------------------------------------------
    params:
    env_name: name of env to collect data from
    itr: data episodes
    policy: policy used for collecting data
    vectorize: vectorized parameters into a list rather than a dictionary, used for system identification
    discrete: discrete randomisation range, as in EPI paper
    vine: Vine data collection, same state and same action at the initial of trajectory, as in EPI paper 
    vine_sample_size: number of state action samples in vine trajectory set
    egreedy: the factor for collecting data with epsilon-greedy policy
    """
    env, environment_params, environment_wrappers, environment_wrapper_arguments = choose_env(env_name)
    action_space = env.action_space
    state_space = env.observation_space
    if policy is None:  # load an off-line trained policy if none is given
        policy=DPG_PolicyNetwork(state_space, action_space, 512).cuda()
        # load from somewhere
    
    history_sa=[]
    history_s_=[]
    params_list=[]
    if vine:
        vine_state_set = []  # underlying state of env, not the observation
        vine_action_set = [] # initial action after initial state
        vine_idx = 0
        # collect state action sets according to EPI's vine implementation
        while vine_idx<vine_sample_size:
            state =  env.reset()
            while vine_idx<vine_sample_size:
                if np.random.rand() < egreedy:
                    action = env.action_space.sample()
                else:
                    action = policy.get_action(state)
                vine_state_set.append(env.get_state())
                vine_action_set.append(action)  
                vine_idx += 1
                next_state, _, done, _ = env.step(action)
                state = next_state

                if done: break

    print('Start collecting transitions.')
    env.ignore_done = True
    for epi in range(itr):
        print('Episode: {}'.format(epi))
        state = env.reset()
        env.randomisation_off()
        # reset_params = env.get_dynamics_parameters()
        if discrete:
            env.randomisation_on()  # as sample_discretized_env_parameters() needs randomisation ranges
            sampled_env_params_dict = sample_discretized_env_parameters(env)
            env.randomisation_off()
            env.set_dynamics_parameters(sampled_env_params_dict)
        if vectorize:
            env.randomisation_on()
            params = query_params(env)
            env.randomisation_off()
        else:
            params = env.get_dynamics_parameters()
        params_list.append(params)

        if vine:
            epi_sa =[]
            epi_s_ =[]
            for underlying_state, action in zip(vine_state_set, vine_action_set):
                env.set_state(underlying_state)  # underlying state is different from obs of env
                state = _flatten_obs(env._get_observation())  # hacked
                try: 
                    next_state, _, done, _ = env.step(action)
                except MujocoException:
                    print('Data collection: MujocoException')
                    action = np.zeros_like(action)
                    next_state = state
                epi_sa.append(np.concatenate((state, action)))
                epi_s_.append(np.array(next_state))
                if done:   # keep using same env after done
                    env.reset()
            history_sa.append(np.array(epi_sa))
            history_s_.append(np.array(epi_s_))

        else:
            epi_traj = []
            for step in range(env.horizon):
                action = policy.get_action(state)
                epi_traj.append(np.concatenate((state, action)))
                try:
                    next_state, _, _, _ = env.step(action)
                except MujocoException:
                    print('MujocoException')
                    action = np.zeros_like(action)
                    next_state = state
                    continue   
                state = next_state
            history_sa.append(np.array(epi_traj))
        env.randomisation_on()
    if vine:
        history = [np.array(history_sa), np.array(history_s_)]
    else:
        history = np.array(history_sa)
    print("Finished collecting data.")
    return params_list, history
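An illustrative call for EPI-style vine data collection; with vine=True the returned history is a pair of arrays holding the (state, action) inputs and the resulting next states (the episode count and epsilon below are arbitrary):

params_list, (history_sa, history_s_) = offline_history_collection(
    'SawyerReach', itr=20, vine=True, vine_sample_size=500, egreedy=0.1)
print(len(params_list), history_sa.shape, history_s_.shape)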
Example #9
    return: parameter dictionary
    '''
    params_dict = env.get_dynamics_parameters()
    params_ranges = env.get_parameter_sampling_ranges()  # range
    params_factors = env.get_factors_for_randomisation() 
    randomized_params_list = env.get_randomised_parameters()
    for key in randomized_params_list:
        param_range = params_ranges[key]
        low = param_range[0]
        high = param_range[1]
        if isinstance(low, (int, np.integer)) and isinstance(high, (int, np.integer)):
            # time-delay parameter, already discrete in the original env
            ranges = np.arange(low, high+1)
            sampled_value = np.random.choice(ranges)
        else:
            value_range = high - low
            low = low + value_range*range_reduction_ratio
            high = high - value_range*range_reduction_ratio
            value_list = [low+((high-low)/splits)*i for i in range(splits)]
            sampled_value = params_factors[key]*np.random.choice(value_list)
        params_dict[key] = sampled_value

    return params_dict
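As a concrete illustration of the continuous branch above (values chosen only for clarity): a raw range of (0.0, 1.0) with range_reduction_ratio=0.1 is first shrunk to (0.1, 0.9) and then split into candidate values before one is sampled and scaled by the parameter's factor:

# worked example of the discretisation step with illustrative numbers
low, high, splits, range_reduction_ratio = 0.0, 1.0, 4, 0.1
value_range = high - low
low, high = low + value_range * range_reduction_ratio, high - value_range * range_reduction_ratio
value_list = [low + ((high - low) / splits) * i for i in range(splits)]
print(value_list)  # -> [0.1, 0.3, 0.5, 0.7] (up to floating-point rounding)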




if __name__ == '__main__':
    env, _, _, _ = choose_env('SawyerReach')
    params = query_params(env)
    print(params, params.shape)