Example #1
def study_cem(params, starting_pol=None) -> None:
    """
    Start a study of CEM algorithms
    :param params: the parameters of the study
    :param starting_pol: initial policy
    :return: nothing
    """
    assert params.policy_type in ['squashedGaussian', 'normal',
                                  'beta'], 'unsupported policy type'
    chrono = Chrono()
    # cuda = torch.device('cuda')
    study = params.gradients
    if params.nb_trajs_cem is not None:
        params.nb_trajs = params.nb_trajs_cem
    simu = make_simu_from_params(params)
    for i in range(1):  # only the first study ('sum') is run here; use range(len(study)) to run all
        simu.env.set_file_name('cem' + study[i] + '_' + simu.env_name)
        print("study : ", study[i])
        for j in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "squashedGaussian":
                policy = SquashedGaussianPolicy(simu.obs_size, 32, 64, 1)
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 32, 64, 1)
            elif params.policy_type == "beta":
                policy = BetaPolicy(simu.obs_size, 32, 64, 1)
            if starting_pol is not None:
                policy.set_weights(starting_pol[j])
            pw = PolicyWrapper(policy, j, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            # plot_policy(policy, simu.env, True, simu.env_name, study[i], '_ante_', j, plot=False)
            simu.train_cem(pw, params, policy)
            # plot_policy(policy, simu.env, True, simu.env_name, study[i], '_post_', j, plot=False)
    chrono.stop()
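
A possible driver for this variant, shown only as a hedged sketch: params is assumed to be the study's argument namespace built elsewhere in the project, and the starting policies come from get_same_starting_policies shown in Example #5 below, so that every repetition starts from identical weights.

# Hypothetical driver code: reuse identical starting policies across repetitions
# so the CEM runs are directly comparable. `params` is assumed to provide the
# fields used above (policy_type, nb_repet, gradients, ...).
starting_policies = get_same_starting_policies(params)  # see Example #5 below
study_cem(params, starting_pol=starting_policies)
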
Example #2
def study_cem(params):
    """
    Start a sum study of CEM
    :param params: the parameters of the study
    :return: the weights, rewards, populations, population scores and kept
             flags collected during training
    """

    assert params.policy_type in ['normal'], 'unsupported policy type'
    # cuda = torch.device('cuda')
    study = params.gradients
    simu = make_simu_from_params(params)
    simu.env.set_file_name(study[0] + '_' + simu.env_name)
    reward_file = None
    print("study : ", study)

    # do not freeze (fix) any policy layers for this study
    params.fix_layers = False

    print("cem study")
    chrono_cem = Chrono()
    for j in range(params.nb_repet):
        simu.env.reinit()
        if params.policy_type == "normal":
            policy = NormalPolicy(simu.obs_size, 24, 36, 1)
        pw = PolicyWrapper(policy, params.policy_type, simu.env_name, j,
                           params.team_name, params.max_episode_steps)
        all_weights, all_rewards, all_pops, all_pops_scores, is_kept = simu.train(
            pw, params, policy, False, reward_file, "", study[0], 0, True)
    cem_time = chrono_cem.stop()
    return all_weights, all_rewards, all_pops, all_pops_scores, is_kept
def evaluate_policy(params, env, weights):
    """
    Evaluate the given policy weights over params.nb_evals episodes
    and return the average total reward
    """
    policy = NormalPolicy(env.observation_space.shape[0], 24, 36, 1, params.lr_actor)
    policy.set_weights(weights)
    average_tot_score = 0
    for j in range(int(params.nb_evals)):
        state = env.reset()
        total_reward = 0
        for t in range(params.max_episode_steps):
            action = policy.select_action(state, params.deterministic_eval)
            # print("action", action)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = next_state

            if done:
                average_tot_score += total_reward / params.nb_evals
                break
    return average_tot_score
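
As a hedged usage sketch tying Example #2 to the helper above: the weights returned by study_cem could be scored with evaluate_policy. This assumes all_rewards is index-aligned with all_weights, that make_simu_from_params exposes the gym environment as simu.env (as in the studies above), and that numpy is imported as np.

# Hypothetical driver: train with CEM (Example #2), then evaluate the weights
# that obtained the best training reward. The index alignment between
# all_weights and all_rewards is an assumption made for illustration.
all_weights, all_rewards, all_pops, all_pops_scores, is_kept = study_cem(params)
simu = make_simu_from_params(params)
best = int(np.argmax(all_rewards))
score = evaluate_policy(params, simu.env, all_weights[best])
print("average evaluation score:", score)
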
def study_regress(params) -> None:
    """
    Start a study where the policy is first initialized by regression and then trained
    :param params: the parameters of the study
    :return: nothing
    """
    assert params.policy_type in ['bernoulli', 'normal',
                                  'squashedGaussian'], 'unsupported policy type'
    chrono = Chrono()
    study = params.gradients
    simu = make_simu_from_params(params)
    for i in range(len(study)):
        simu.env.set_file_name(study[i] + '_' + simu.env_name)
        policy_loss_file, critic_loss_file = set_files(study[i], simu.env_name)
        print("study : ", study[i])
        for j in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "bernoulli":
                policy = BernoulliPolicy(simu.obs_size, 24, 36, 1,
                                         params.lr_actor)
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 24, 36, 1,
                                      params.lr_actor)
            elif params.policy_type == "squashedGaussian":
                policy = SquashedGaussianPolicy(simu.obs_size, 24, 36, 1,
                                                params.lr_actor)
            pw = PolicyWrapper(policy, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            plot_policy(policy,
                        simu.env,
                        True,
                        simu.env_name,
                        study[i],
                        '_ante_',
                        j,
                        plot=False)

            if not simu.discrete:
                act_size = simu.env.action_space.shape[0]
                critic = QNetworkContinuous(simu.obs_size + act_size, 24, 36,
                                            1, params.lr_critic)
            else:
                critic = VNetwork(simu.obs_size, 24, 36, 1, params.lr_critic)
            # plot_critic(simu, critic, policy, study[i], '_ante_', j)

            regress(simu, policy, params.policy_type, 250, params.render)
            simu.train(pw, params, policy, critic, policy_loss_file,
                       critic_loss_file, study[i])
            plot_policy(policy,
                        simu.env,
                        True,
                        simu.env_name,
                        study[i],
                        '_post_',
                        j,
                        plot=False)
            plot_critic(simu, critic, policy, study[i], '_post_', j)
            critic.save_model('data/critics/' + params.env_name + '#' +
                              params.team_name + '#' + study[i] + str(j) +
                              '.pt')
    chrono.stop()
Example #5
def get_same_starting_policies(params):
    simu = make_simu_from_params(params)
    policies = []
    for i in range(params.nb_repet):
        if params.policy_type == 'normal':
            policies.append(
                NormalPolicy(simu.obs_size, 32, 64, 1,
                             params.lr_actor).get_weights())
        elif params.policy_type == 'squashedGaussian':
            policies.append(
                SquashedGaussianPolicy(simu.obs_size, 32, 64, 1,
                                       params.lr_actor).get_weights())
        elif params.policy_type == 'beta':
            policies.append(
                BetaPolicy(simu.obs_size, 32, 64, 1,
                           params.lr_actor).get_weights())
    return policies
Example #6
def study_beta(params):
    """
    Study the effect of the beta parameter by training with several values of beta
    :param params: the parameters of the study
    """
    simu = make_simu_from_params(params)
    for beta in [0.1, 0.5, 1.0, 5.0, 10.0]:
        print("beta:", beta)
        policy_loss_file, critic_loss_file = set_files(str(beta), simu.env_name)
        simu.env.set_file_name(str(beta) + '_' + simu.env_name)
        for i in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "bernoulli":
                policy = BernoulliPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            if not simu.discrete:
                act_size = simu.env.action_space.shape[0]
                critic = QNetworkContinuous(simu.obs_size + act_size, 24, 36, 1, params.lr_critic)
            else:
                critic = VNetwork(simu.obs_size, 24, 36, 1, params.lr_critic)
            pw = PolicyWrapper(policy, params.policy_type, simu.env_name, params.team_name, params.max_episode_steps)
            simu.train(pw, params, policy, critic, policy_loss_file, critic_loss_file, "beta", beta)
def evaluate_policy(params, env, weights):
    """
    Perform an episode using the policy parameter and return the obtained reward
    Used to evaluate an already trained policy, without storing data for further training
    :return: the total reward collected during the episode
    """
    if params.multi_threading:
        ray.init(include_dashboard=False)

        @ray.remote
        def eval(params, nb_evals, sim):
            average_tot_score = 0
            for j in range(nb_evals):
                state = sim.env.reset()
                total_reward = 0
                for t in range(params.max_episode_steps):
                    action = policy.select_action(state,
                                                  params.deterministic_eval)
                    # print("action", action)
                    if params.policy_type == "normal":
                        next_state, reward, done, _ = sim.env.step(action)
                    elif params.policy_type == "beta":
                        if params.env_name == "Pendulum-v0":
                            next_state, reward, done, _ = sim.env.step(
                                2 * (2 * action - 1))
                        elif params.env_name == "CartPoleContinuous-v0":
                            next_state, reward, done, _ = sim.env.step(
                                2 * action - 1)
                    total_reward += reward
                    state = next_state
                    if done:
                        # print(total_reward)
                        average_tot_score += total_reward
                        break
            sim.env.close()  # close this worker's own copy of the environment
            return average_tot_score / nb_evals

        if params.policy_type == "normal":
            policy = NormalPolicy(env.observation_space.shape[0], 32, 64, 1,
                                  params.lr_actor)
        if params.policy_type == "beta":
            policy = BetaPolicy(env.observation_space.shape[0], 32, 64, 1,
                                params.lr_actor)
        policy.set_weights(weights)
        workers = min(16, os.cpu_count() + 4)
        evals = int(params.nb_evals / workers)
        sim_list = []
        for i in range(workers):
            sim_list.append(Simulator(params))
        futures = [eval.remote(params, evals, sim) for sim in sim_list]
        returns = ray.get(futures)
        ray.shutdown()
        average_tot_score = np.sum(returns) / workers
        return average_tot_score
    else:
        if params.policy_type == "normal":
            policy = NormalPolicy(env.observation_space.shape[0], 32, 64, 1,
                                  params.lr_actor)
        if params.policy_type == "beta":
            policy = BetaPolicy(env.observation_space.shape[0], 32, 64, 1,
                                params.lr_actor)
        policy.set_weights(weights)
        average_tot_score = 0
        for j in range(int(params.nb_evals)):
            state = env.reset()
            total_reward = 0
            for t in range(params.max_episode_steps):
                action = policy.select_action(state, params.deterministic_eval)
                next_state, reward, done, _ = env.step(action)
                # if params.policy_type == "normal":
                #     next_state, reward, done, _ = env.step(action)
                # elif params.policy_type == "beta":
                #     if params.env_name == "Pendulum-v0":
                #         next_state, reward, done, _ = env.step(2 * (2 * action - 1))
                #     elif params.env_name == "CartPoleContinuous-v0":
                #         next_state, reward, done, _ = env.step(2 * action - 1)
                total_reward += reward
                state = next_state
                if done:
                    average_tot_score += total_reward / params.nb_evals
                    break
        return average_tot_score
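
The multi-threaded branch above splits params.nb_evals evaluation episodes across Ray workers and averages the per-worker returns. Below is a small self-contained sketch of that fan-out/gather pattern with a toy task standing in for the evaluation loop (the real loop needs the project's Simulator); it only requires the ray and numpy packages.

# Toy illustration of the Ray pattern used in evaluate_policy above; the
# per-worker task is a placeholder, not the project's evaluation loop.
import ray
import numpy as np

ray.init(include_dashboard=False)

@ray.remote
def eval_chunk(n_episodes, seed):
    # Each worker handles its own share of the evaluation episodes.
    rng = np.random.default_rng(seed)
    return float(np.mean(rng.normal(size=n_episodes)))

workers = 4
evals_per_worker = 40 // workers
futures = [eval_chunk.remote(evals_per_worker, s) for s in range(workers)]
returns = ray.get(futures)
ray.shutdown()
average_tot_score = np.sum(returns) / workers
print("average score:", average_tot_score)
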
def study_pg(params) -> None:
    """
    Start a study of the policy gradient algorithms
    :param params: the parameters of the study
    :return: nothing
    """
    #### MODIFICATION: added the 'discrete' policy type
    assert params.policy_type in [
        'bernoulli', 'normal', 'squashedGaussian', 'discrete'
    ], 'unsupported policy type'
    ####
    chrono = Chrono()
    # cuda = torch.device('cuda')
    study = params.gradients
    simu = make_simu_from_params(params)
    for i in range(len(study)):
        simu.env.set_file_name(study[i] + '_' + simu.env_name)
        policy_loss_file, critic_loss_file = set_files(study[i], simu.env_name)
        print("study : ", study[i])
        for j in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "bernoulli":
                policy = BernoulliPolicy(simu.obs_size, 100, 200, 1,
                                         params.lr_actor)
            #### MODIFICATION: added the discrete policy
            elif params.policy_type == "discrete":
                if isinstance(simu.env.action_space, gym.spaces.box.Box):
                    nb_actions = int(simu.env.action_space.high[0] -
                                     simu.env.action_space.low[0] + 1)
                    print("Error: environment action space is not discrete: " +
                          str(simu.env.action_space))
                else:
                    nb_actions = simu.env.action_space.n
                policy = DiscretePolicy(simu.obs_size, 24, 36, nb_actions,
                                        params.lr_actor)
            ####
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 100, 200, 1,
                                      params.lr_actor)
            elif params.policy_type == "squashedGaussian":
                policy = SquashedGaussianPolicy(simu.obs_size, 100, 200, 1,
                                                params.lr_actor)
            elif params.policy_type == "DDPG":
                policy = DDPG(simu.obs_size, 24, 36, 1, params.lr_actor)
            # policy = policy.cuda()
            pw = PolicyWrapper(policy, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            plot_policy(policy,
                        simu.env,
                        True,
                        simu.env_name,
                        study[i],
                        '_ante_',
                        j,
                        plot=False)

            if not simu.discrete:
                act_size = simu.env.action_space.shape[0]
                critic = QNetworkContinuous(simu.obs_size + act_size, 24, 36,
                                            1, params.lr_critic)
            else:
                critic = VNetwork(simu.obs_size, 24, 36, 1, params.lr_critic)
            # plot_critic(simu, critic, policy, study[i], '_ante_', j)

            simu.train(pw, params, policy, critic, policy_loss_file,
                       critic_loss_file, study[i])
            plot_policy(policy,
                        simu.env,
                        True,
                        simu.env_name,
                        study[i],
                        '_post_',
                        j,
                        plot=False)
            if False:  # weight/action histogram plots disabled by default
                if params.policy_type == "normal":
                    plot_normal_histograms(policy, j, simu.env_name)
                else:
                    plot_weight_histograms(policy, j, simu.env_name)
        plot_critic(simu, critic, policy, study[i], '_post_', j)
        critic.save_model('data/critics/' + params.env_name + '#' +
                          params.team_name + '#' + study[i] + str(j) + '.pt')
    chrono.stop()