Example #1
def sample_random_paths(max_samples,
                        ma_mode,
                        sampler,
                        max_path_length=np.inf,
                        scope=None):
    return singleton_pool.run_collect(_worker_collect_path_random_one_env,
                                      threshold=max_samples,
                                      args=(max_path_length, ma_mode, sampler, scope),
                                      show_prog_bar=True)
Example #2
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        reset_arg=None,
        show_prog_bar=True,
        multi_task=False):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    if multi_task:
        assert len(policy_params) == singleton_pool.n_parallel
        all_params = [(params, scope) for params in policy_params]
        singleton_pool.run_each(
            _worker_set_policy_params,
            all_params,
        )
    else:
        singleton_pool.run_each(
            _worker_set_policy_params,
            [(policy_params, scope)] * singleton_pool.n_parallel
        )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )

    if multi_task:
        args = [(max_path_length, scope, arg) for arg in reset_arg]
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=args,
            show_prog_bar=show_prog_bar,
            multi_task=multi_task,
        )
    else:
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope, reset_arg),
            show_prog_bar=show_prog_bar,
            multi_task=multi_task,
        )
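The docstring above pins down the contract: one flat parameter vector is broadcast to every worker in the single-task case, while the multi-task branch expects one parameter set and one reset_arg entry per worker process. A minimal usage sketch under those assumptions (the parallel_sampler import path and the policy.get_param_values() call are rllab-style conventions, not taken from this snippet):

from rllab.sampler import parallel_sampler  # assumed module location for the code above

def collect_batch(policy, batch_size=10000, horizon=500, goals=None):
    """Gather roughly batch_size samples; pass per-worker goals for the multi-task branch."""
    params = policy.get_param_values()  # flat parameter vector (rllab policy API)
    if goals is None:
        # Single-task: the same parameters are broadcast to every worker.
        return parallel_sampler.sample_paths(
            policy_params=params, max_samples=batch_size, max_path_length=horizon)
    # Multi-task: one parameter copy and one reset argument per worker process.
    n = parallel_sampler.singleton_pool.n_parallel
    assert len(goals) == n
    return parallel_sampler.sample_paths(
        policy_params=[params] * n, max_samples=batch_size,
        max_path_length=horizon, reset_arg=goals, multi_task=True)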
Example #3
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        reset_arg=None,
        show_prog_bar=True,
        multi_task=False):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    if multi_task:
        assert len(policy_params) == singleton_pool.n_parallel
        all_params = [(params, scope) for params in policy_params]
        singleton_pool.run_each(
            _worker_set_policy_params,
            all_params,
        )
    else:
        singleton_pool.run_each(
            _worker_set_policy_params,
            [(policy_params, scope)] * singleton_pool.n_parallel
        )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )

    if multi_task:
        args = [(max_path_length, scope, arg) for arg in reset_arg]
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=args,
            show_prog_bar=show_prog_bar,
            multi_task=multi_task,
        )
    else:
        return singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope, reset_arg),
            show_prog_bar=show_prog_bar,
            multi_task=multi_task,
        )
Example #4
def sample_paths(policy_params,
                 max_samples,
                 max_path_length=np.inf,
                 low_policy_params=None,
                 env_params=None,
                 scope=None):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(_worker_set_policy_params,
                            [(policy_params, scope)] *
                            singleton_pool.n_parallel)
    if low_policy_params is not None:
        singleton_pool.run_each(_worker_set_low_policy_params,
                                [(low_policy_params, scope)] *
                                singleton_pool.n_parallel)
    if env_params is not None:
        singleton_pool.run_each(_worker_set_env_params, [(env_params, scope)] *
                                singleton_pool.n_parallel)
    return singleton_pool.run_collect(_worker_collect_one_path,
                                      threshold=max_samples,
                                      args=(max_path_length, scope),
                                      show_prog_bar=True)
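Example #4 leans on the same run_each broadcast idiom as the others: an identical argument tuple is repeated n_parallel times so that each worker process applies the setter once to its own copy of the policy, low-level policy, or environment. A toy, single-process illustration of that idiom (the MiniPool class below is illustrative only, not rllab's actual pool):

class MiniPool:
    """Toy stand-in for a worker pool: applies a callable once per simulated worker."""
    def __init__(self, n_parallel):
        self.n_parallel = n_parallel
        self.worker_state = [dict() for _ in range(n_parallel)]

    def run_each(self, fn, args_list):
        # One argument tuple per worker; repeating the same tuple broadcasts shared values.
        assert len(args_list) == self.n_parallel
        return [fn(state, *args) for state, args in zip(self.worker_state, args_list)]

def _set_policy_params(worker_state, params, scope=None):
    worker_state['policy_params'] = params  # each worker keeps its own copy

pool = MiniPool(n_parallel=4)
pool.run_each(_set_policy_params, [([0.1, 0.2], None)] * pool.n_parallel)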
Example #5
def sample_paths(policy_params,
                 dynamics_params,
                 max_samples,
                 max_path_length=np.inf,
                 itr=None,
                 obs_mean=None,
                 obs_std=None,
                 act_mean=None,
                 act_std=None):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(_worker_set_policy_params,
                            [(policy_params, )] * singleton_pool.n_parallel)

    # Set dynamics params.
    # --------------------
    singleton_pool.run_each(_worker_set_dynamics_params,
                            [(dynamics_params, )] * singleton_pool.n_parallel)
    # --------------------
    return singleton_pool.run_collect(_worker_collect_one_path,
                                      threshold=max_samples,
                                      args=(max_path_length, itr, obs_mean,
                                            obs_std, act_mean, act_std),
                                      show_prog_bar=True)
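Example #5 pushes dynamics parameters alongside the policy parameters before collection starts. The worker-side setters are not shown on this page; a sketch of what such a setter plausibly looks like (the per-worker state object G and the set_param_values call are assumptions modelled on the rllab policy API):

def _worker_set_dynamics_params_sketch(G, dynamics_params):
    # G holds per-worker globals (env, policy, dynamics model); run_each calls this
    # once in every worker with the tuple broadcast by sample_paths above.
    G.dynamics_model.set_param_values(dynamics_params)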
Example #6
def sample_paths(policy_params,
                 max_samples,
                 max_path_length=np.inf,
                 env_params=None,
                 scope=None,
                 useImitationPolicy=False,
                 useImitationEnv=False,
                 count_traj=False,
                 terminate_only_max_path=False):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :param count_traj: if true then max_samples is the desired maximum number of trajectories to be collected.
    :return: a list of collected paths
    """
    if not useImitationPolicy and policy_params is not None:
        singleton_pool.run_each(_worker_set_policy_params,
                                [(policy_params, scope)] *
                                singleton_pool.n_parallel)
    if env_params is not None:
        singleton_pool.run_each(_worker_set_env_params, [(env_params, scope)] *
                                singleton_pool.n_parallel)
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, scope, useImitationPolicy, useImitationEnv,
              count_traj, terminate_only_max_path),
        show_prog_bar=True)
Example #7
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, scope),
        show_prog_bar=True
    )
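All threshold-based variants on this page share the semantics spelled out in the docstring: whole trajectories are kept, so the collected sample count can overshoot max_samples. A single-process sketch of that collection loop (illustrative only, not singleton_pool.run_collect itself):

def run_collect_sketch(collect_one_path, threshold, args=()):
    """Collect whole paths until the total number of samples reaches threshold."""
    paths, n_samples = [], 0
    while n_samples < threshold:
        path, path_length = collect_one_path(*args)  # one full rollout plus its length
        paths.append(path)
        n_samples += path_length  # whole rollouts only, so the total may exceed threshold
    return paths

# With max_path_length=100 and threshold=250 this returns 3 paths (300 samples), not 250.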
Example #8
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        dyn_model=None,  #Updated
        env_params=None,
        scope=None,
        policy=None,  #Updated
        rau=None,  #Updated
        delta=0,  #Updated
        constraint_fn=None,  #Updated
        constraint_cost_fn=None,  #Updated
        HCMPC_Activation=False,  #Updated
        Constrained=False,  #Updated for Constrained Algorithms
):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(_worker_set_policy_params,
                            [(policy_params, scope)] *
                            singleton_pool.n_parallel)
    if env_params is not None:
        singleton_pool.run_each(_worker_set_env_params, [(env_params, scope)] *
                                singleton_pool.n_parallel)


    # Updated
    if Constrained:
        return singleton_pool.run_collect(
            _worker_collect_one_path_constrained,
            threshold=max_samples,
            args=(dyn_model, max_path_length, scope, policy, rau, delta,
                  constraint_fn, constraint_cost_fn, HCMPC_Activation),
            show_prog_bar=True)
    else:
        return singleton_pool.run_collect(_worker_collect_one_path,
                                          threshold=max_samples,
                                          args=(max_path_length, scope),
                                          show_prog_bar=True)
Example #9
def sample_paths(
        policy_params,
        dynamics_params,
        max_samples,
        max_path_length=np.inf,
        itr=None,
        normalize_reward=None,
        reward_mean=None,
        reward_std=None,
        kl_batch_size=None,
        n_itr_update=None,
        use_replay_pool=None,
        obs_mean=None,
        obs_std=None,
        act_mean=None,
        act_std=None,
        second_order_update=None,
        use_hide=True,
        use_hide_alg='my',
        show_rollout_chance=0.01,
        hide_tmax=10,
        mode=None
):
    """
    :param policy_params: (dict) parameters for policies. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params,)] * singleton_pool.n_parallel
    )

    # Set dynamics params.
    # --------------------
    singleton_pool.run_each(
        _worker_set_dynamics_params,
        [(dynamics_params,)] * singleton_pool.n_parallel
    )
    # --------------------
    # Roll out a random number of paths while not exceeding max_samples (params['batch_size'])
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, itr, normalize_reward, reward_mean,
              reward_std, kl_batch_size, n_itr_update, use_replay_pool,
              obs_mean, obs_std, act_mean, act_std, second_order_update,
              use_hide, use_hide_alg, mode, show_rollout_chance, hide_tmax),
        show_prog_bar=False
    )
Example #10
def sample_paths_a2c(policy_params,
                     max_samples,
                     ma_mode,
                     max_path_length=np.inf,
                     env_params=None,
                     scope=None):
    singleton_pool.run_each(_worker_set_policy_params,
                            [(policy_params, ma_mode, scope)] *
                            singleton_pool.n_parallel)
    if env_params is not None:
        singleton_pool.run_each(_worker_set_env_params, [(env_params, scope)] *
                                singleton_pool.n_parallel)

    return singleton_pool.run_collect(_worker_collect_path_one_env_a2c,
                                      threshold=max_samples,
                                      args=(max_path_length, ma_mode, scope),
                                      show_prog_bar=True)
Example #11
def sample_paths(
        policy_params,
        dynamics_params,
        max_samples,
        max_path_length=np.inf,
        itr=None,
        normalize_reward=None,
        reward_mean=None,
        reward_std=None,
        kl_batch_size=None,
        n_itr_update=None,
        use_replay_pool=None,
        obs_mean=None,
        obs_std=None,
        act_mean=None,
        act_std=None,
        second_order_update=None
):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """
    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params,)] * singleton_pool.n_parallel
    )

    # Set dynamics params.
    # --------------------
    singleton_pool.run_each(
        _worker_set_dynamics_params,
        [(dynamics_params,)] * singleton_pool.n_parallel
    )
    # --------------------
    return singleton_pool.run_collect(
        _worker_collect_one_path,
        threshold=max_samples,
        args=(max_path_length, itr, normalize_reward, reward_mean,
              reward_std, kl_batch_size, n_itr_update, use_replay_pool, obs_mean, obs_std, act_mean, act_std, second_order_update),
        show_prog_bar=True
    )
Example #12
def sample_paths(policy_params,
                 max_samples,
                 ma_mode,
                 max_path_length=np.inf,
                 env_params=None,
                 scope=None):
    if ma_mode == 'concurrent':
        assert isinstance(policy_params, list)
    singleton_pool.run_each(_worker_set_policy_params,
                            [(policy_params, ma_mode, scope)] *
                            singleton_pool.n_parallel)
    if env_params is not None:
        singleton_pool.run_each(_worker_set_env_params, [(env_params, scope)] *
                                singleton_pool.n_parallel)

    return singleton_pool.run_collect(_worker_collect_path_one_env,
                                      threshold=max_samples,
                                      args=(max_path_length, ma_mode, scope),
                                      show_prog_bar=True)
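Example #12 adds the multi-agent ma_mode switch and asserts that 'concurrent' mode receives one parameter set per agent policy. A hedged call sketch under that assumption (the agent_policies objects and their get_param_values() method follow the rllab policy API and are not part of this snippet):

def sample_concurrent(agent_policies, batch_size=20000, horizon=200):
    # One flat parameter vector per agent, matching the 'concurrent' assertion above.
    agent_params = [p.get_param_values() for p in agent_policies]
    return sample_paths(
        policy_params=agent_params,   # must be a list in 'concurrent' mode
        max_samples=batch_size,
        ma_mode='concurrent',
        max_path_length=horizon,
    )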
Example #13
def sample_paths(policy_params,
                 max_samples,
                 max_path_length=np.inf,
                 env_params=None,
                 scope=None,
                 reset_arg=None,
                 show_prog_bar=True,
                 multi_task=False,
                 extra_infos=None,
                 taskIdx=0):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """

    #if preupdate:

    singleton_pool.run_each(_worker_set_policy_params,
                            [(policy_params, scope)] *
                            singleton_pool.n_parallel)

    return singleton_pool.run_collect(
        _worker_collect_one_path,
        numPaths=max_samples / max_path_length,
        args_list=[
            (max_path_length, scope, np.array(reset_arg), taskIdx, extra_infos)
        ] * singleton_pool.n_parallel,
        show_prog_bar=show_prog_bar,
        multi_task=multi_task,
    )
Example #14
def sample_paths(
        policy_params,
        max_samples,
        max_path_length=np.inf,
        env_params=None,
        scope=None,
        iter=0,
        env=None,
        policy=None,
        baseline=None,
        sim_percentage=1.0 / 3.0,
        target_task=None):
    """
    :param policy_params: parameters for the policy. This will be updated on each worker process
    :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples
    might be greater since all trajectories will be rolled out either until termination or until max_path_length is
    reached
    :param max_path_length: horizon / maximum length of a single trajectory
    :return: a list of collected paths
    """


    singleton_pool.run_each(
        _worker_set_policy_params,
        [(policy_params, scope)] * singleton_pool.n_parallel
    )
    if env_params is not None:
        singleton_pool.run_each(
            _worker_set_env_params,
            [(env_params, scope)] * singleton_pool.n_parallel
        )

    if target_task is not None:
        singleton_pool.run_each(_worker_update_dyn,
                                [('target_task', target_task, scope)] * singleton_pool.n_parallel)

    if singleton_pool.G.ensemble_dynamics['use_ens_dyn'] and iter > 0:
        singleton_pool.run_each(_worker_update_dyn,
                                [('dyn_model_choice', 0, scope)] * singleton_pool.n_parallel)
        result1 = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples * (sim_percentage),
            args=(max_path_length, scope),
            show_prog_bar=True
        )

        singleton_pool.run_each(_worker_update_dyn,
                                [('dyn_model_choice', 1, scope)] * singleton_pool.n_parallel)
        singleton_pool.run_each(_worker_update_dyn,
                                [('base_paths', result1, scope)] * singleton_pool.n_parallel)
        singleton_pool.run_each(_worker_update_dyn,
                                [('baseline', baseline, scope)] * singleton_pool.n_parallel)

        result2 = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples * (1-sim_percentage),
            args=(max_path_length, scope),
            show_prog_bar=True
        )

        result = result1 + result2
        #result = result1
    else:
        result = singleton_pool.run_collect(
            _worker_collect_one_path,
            threshold=max_samples,
            args=(max_path_length, scope),
            show_prog_bar=True
        )

    logger.log('Collected Traj Num: '+str(len(result)))

    if 'model_parameters' in result[0]['env_infos'] and logger._snapshot_dir is not None:
        mp_rew_raw = []
        for path in result:
            mp_rew_raw.append([np.array(path['env_infos']['model_parameters'][-1]), path['rewards'].sum()])
        mp_rew_raw.sort(key=lambda x: str(x[0]))
        #print(mp_rew_raw)
        mp_rew = []
        i = 0
        while True:
            if i >= len(mp_rew_raw) - 1:
                break
            cur_mp = mp_rew_raw[i][0]
            cur_rew = mp_rew_raw[i][1]
            cur_mp_num = 1
            for j in range(i + 1, len(mp_rew_raw)):
                if (mp_rew_raw[j][0] - cur_mp).any():
                    break
                cur_rew += mp_rew_raw[j][1]
                cur_mp_num += 1
            i += cur_mp_num
            mp_rew.append([np.array(cur_mp), cur_rew * 1.0 / cur_mp_num])
        mp_rew.sort(key=lambda x: x[1])
        filename = logger._snapshot_dir + '/mp_rew_' + str(iter) + '.pkl'
        pickle.dump(mp_rew, open(filename, 'wb'))

    if singleton_pool.G.ensemble_dynamics['use_ens_dyn']:
        dyn_training_x = []
        dyn_training_y = []
        dyn_training_result = result
        if iter > 0:
            dyn_training_result = result1
        for path in dyn_training_result:
            for state_act in path['env_infos']['state_act']:
                dyn_training_x.append(state_act)
            for next_state in path['env_infos']['next_state']:
                dyn_training_y.append(next_state)
        singleton_pool.G.ensemble_dynamics['training_buffer_x'] += dyn_training_x
        singleton_pool.G.ensemble_dynamics['training_buffer_y'] += dyn_training_y
        if len(singleton_pool.G.ensemble_dynamics['training_buffer_x']) > 10000:
            singleton_pool.G.ensemble_dynamics['training_buffer_x'] = singleton_pool.G.ensemble_dynamics['training_buffer_x'][-10000:]
            singleton_pool.G.ensemble_dynamics['training_buffer_y'] = singleton_pool.G.ensemble_dynamics['training_buffer_y'][-10000:]
        if iter % 1 == 0:  # always true as written: refit the dynamics model every iteration
            optimize_iter = 100
            if iter != 0:
                optimize_iter = 5
            singleton_pool.G.ensemble_dynamics['dyn_models'][0].fit(
                singleton_pool.G.ensemble_dynamics['training_buffer_x'],
                singleton_pool.G.ensemble_dynamics['training_buffer_y'],
                iter=optimize_iter)
            #singleton_pool.G.ensemble_dynamics['transition_locator'].fit(singleton_pool.G.ensemble_dynamics['training_buffer_x'], singleton_pool.G.ensemble_dynamics['training_buffer_y'])
            print('fitted dynamic models and transition locator')
            singleton_pool.run_each(
                _worker_update_dyn,
                [('dyn_models', singleton_pool.G.ensemble_dynamics['dyn_models'], scope)]
                * singleton_pool.n_parallel)
            #singleton_pool.run_each(_worker_update_dyn, [('transition_locator',
            #                                                     singleton_pool.G.ensemble_dynamics['transition_locator'], scope)] * singleton_pool.n_parallel)
            if logger._snapshot_dir is not None:
                joblib.dump(singleton_pool.G.ensemble_dynamics['dyn_models'], logger._snapshot_dir+'/dyn_models.pkl', compress=True)

    # augment the data with synthetic data
        '''if iter > 0:
            logger.log('Synthetizing data...')
            bg = time.time()
            dartenv = env._wrapped_env.env.env
            dartenv.dyn_model_id = 1
            dartenv.reset()
            if env._wrapped_env.monitoring:
                dartenv = dartenv.env
            data_size = int(max_samples * (1-sim_percentage))
            random_state = []
            for i in range(data_size):
                path = result[np.random.randint(len(result))]
                state_act = path['env_infos']['state_act'][np.random.randint(len(path['env_infos']['state_act']))]
                state = state_act[0:singleton_pool.G.ensemble_dynamics['dyn_models'][0].state_dim]
                random_state.append(state + np.random.uniform(low=0.01, high = 0.01, size=len(state)))
            obs = []
            for i in range(data_size):
                dartenv.set_state_vector(random_state[i])
                obs.append(dartenv._get_obs())
            raw_actions = policy.get_actions(obs)
            actions = raw_actions[0]

            next_state = []
            for i in range(data_size):
                next_state.append(singleton_pool.G.ensemble_dynamics['dyn_models'][0].do_simulation(random_state[i], actions[i], 4))
            rewards = []
            for i in range(data_size):
                rewards.append(dartenv.get_reward(random_state[i], actions[i], next_state[i], 0.2))
            for i in range(data_size):
                newpath = {}
                newpath['rewards'] = np.array([rewards[i]])
                newpath['env_infos'] = {}
                newpath['env_infos']['dyn_model_id'] = np.array([1])
                env_info_keys = list(result[0]['env_infos'].keys())
                for key in env_info_keys:
                    if key not in newpath['env_infos']:
                        newpath['env_infos'][key] = np.copy(result[0]['env_infos'][key][[-1]])
                newpath['observations'] = np.array([obs[i]])
                newpath['actions'] = np.array([actions[i]])
                newpath['agent_infos'] = {}
                newpath['agent_infos']['log_std'] = raw_actions[1]['log_std'][[i]]
                newpath['agent_infos']['mean'] = raw_actions[1]['mean'][[i]]

                result.append(newpath)
            dartenv.dyn_model_id = 0
            ed = time.time()
            logger.log('Synthesize done, created: '+str(ed-bg))'''

    return result
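The model-parameter bookkeeping in Example #14 sorts rollouts by their model_parameters vector and averages the total reward over each run of identical vectors before pickling the result. A more direct sketch of the same aggregation using a dictionary (names are illustrative and this is not a drop-in replacement for the loop above):

from collections import defaultdict
import numpy as np

def average_return_per_model(paths):
    # Group rollouts by the model parameters they were generated under and
    # average each group's total reward, sorted from worst to best.
    totals = defaultdict(lambda: [0.0, 0])
    for path in paths:
        key = tuple(np.asarray(path['env_infos']['model_parameters'][-1]).ravel())
        totals[key][0] += path['rewards'].sum()
        totals[key][1] += 1
    return sorted(([np.array(key), total / count] for key, (total, count) in totals.items()),
                  key=lambda x: x[1])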