def study_cem(params, starting_pol=None) -> None:
    """
    Start a study of CEM algorithms
    :param params: the parameters of the study
    :param starting_pol: initial policy weights, one weight vector per repetition
    :return: nothing
    """
    assert params.policy_type in ['squashedGaussian', 'normal', 'beta'], 'unsupported policy type'
    chrono = Chrono()
    # cuda = torch.device('cuda')
    study = params.gradients
    if params.nb_trajs_cem is not None:
        params.nb_trajs = params.nb_trajs_cem
    simu = make_simu_from_params(params)
    for i in range(1):  # only the first study ('sum') is run here; use range(len(study)) to run them all
        simu.env.set_file_name('cem' + study[i] + '_' + simu.env_name)
        print("study : ", study[i])
        for j in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "squashedGaussian":
                policy = SquashedGaussianPolicy(simu.obs_size, 32, 64, 1)
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 32, 64, 1)
            elif params.policy_type == "beta":
                policy = BetaPolicy(simu.obs_size, 32, 64, 1)
            if starting_pol is not None:
                policy.set_weights(starting_pol[j])
            pw = PolicyWrapper(policy, j, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            # plot_policy(policy, simu.env, True, simu.env_name, study[i], '_ante_', j, plot=False)
            simu.train_cem(pw, params, policy)
            # plot_policy(policy, simu.env, True, simu.env_name, study[i], '_post_', j, plot=False)
    chrono.stop()

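# --- Usage sketch (assumption): study_cem only needs the params fields it
# reads above. The attribute set normally comes from the repo's argument
# parser, so the Namespace below is a hand-built stand-in and 'sum' is a
# placeholder gradient name.
def _demo_study_cem():
    from argparse import Namespace
    params = Namespace(
        env_name='CartPoleContinuous-v0',  # assumed to be consumed by make_simu_from_params
        policy_type='normal',
        gradients=['sum'],                 # study names; only the first entry is run above
        nb_trajs_cem=None,                 # None leaves params.nb_trajs unchanged
        nb_repet=3,
        team_name='default_team',
        max_episode_steps=200,
    )
    study_cem(params)
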
def study_cem_sum(params):
    """
    Start a sum study of CEM
    :param params: the parameters of the study
    :return: the weights, rewards, populations and population scores
             collected during training, plus the is_kept flags
    """
    assert params.policy_type in ['normal'], 'unsupported policy type'
    # cuda = torch.device('cuda')
    study = params.gradients
    simu = make_simu_from_params(params)
    simu.env.set_file_name(study[0] + '_' + simu.env_name)
    reward_file = None
    print("study : ", study)
    params.fix_layers = False  # unfix the layers
    print("cem study")
    chrono_cem = Chrono()
    for j in range(params.nb_repet):
        simu.env.reinit()
        if params.policy_type == "normal":
            policy = NormalPolicy(simu.obs_size, 24, 36, 1)
        pw = PolicyWrapper(policy, params.policy_type, simu.env_name, j,
                           params.team_name, params.max_episode_steps)
        all_weights, all_rewards, all_pops, all_pops_scores, is_kept = simu.train(
            pw, params, policy, False, reward_file, "", study[0], 0, True)
    cem_time = chrono_cem.stop()
    return all_weights, all_rewards, all_pops, all_pops_scores, is_kept

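# --- Follow-up sketch (assumption): since study_cem_sum returns its training
# artifacts, a caller can rank the weight vectors by recorded reward and
# re-evaluate the best one. Assumes all_weights and all_rewards are parallel
# sequences, matching how they are unpacked from simu.train above; 'env' is a
# hypothetical gym environment matching params.env_name.
def _demo_pick_best_cem_weights(params, env):
    import numpy as np
    all_weights, all_rewards, all_pops, all_pops_scores, is_kept = study_cem_sum(params)
    best_idx = int(np.argmax(all_rewards))
    score = evaluate_policy(params, env, all_weights[best_idx])
    print("best training reward:", all_rewards[best_idx], "re-evaluated score:", score)
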
def evaluate_policy_simple(params, env, weights):
    # Lightweight single-process evaluator; see evaluate_policy below for the
    # multi-threaded variant.
    policy = NormalPolicy(env.observation_space.shape[0], 24, 36, 1, params.lr_actor)
    policy.set_weights(weights)
    average_tot_score = 0
    for j in range(int(params.nb_evals)):
        state = env.reset()
        total_reward = 0
        for t in range(params.max_episode_steps):
            action = policy.select_action(state, params.deterministic_eval)
            # print("action", action)
            next_state, reward, done, _ = env.step(action)
            total_reward += reward
            state = next_state
            if done:
                average_tot_score += total_reward / params.nb_evals
                break
    return average_tot_score

def study_regress(params) -> None:
    """
    Start a study of policy gradient algorithms where each policy
    is first fitted by regression before being trained
    :param params: the parameters of the study
    :return: nothing
    """
    assert params.policy_type in ['bernoulli', 'normal', 'squashedGaussian'], 'unsupported policy type'
    chrono = Chrono()
    study = params.gradients
    simu = make_simu_from_params(params)
    for i in range(len(study)):
        simu.env.set_file_name(study[i] + '_' + simu.env_name)
        policy_loss_file, critic_loss_file = set_files(study[i], simu.env_name)
        print("study : ", study[i])
        for j in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "bernoulli":
                policy = BernoulliPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            elif params.policy_type == "squashedGaussian":
                policy = SquashedGaussianPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            pw = PolicyWrapper(policy, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            plot_policy(policy, simu.env, True, simu.env_name, study[i], '_ante_', j, plot=False)
            if not simu.discrete:
                act_size = simu.env.action_space.shape[0]
                critic = QNetworkContinuous(simu.obs_size + act_size, 24, 36, 1, params.lr_critic)
            else:
                critic = VNetwork(simu.obs_size, 24, 36, 1, params.lr_critic)
            # plot_critic(simu, critic, policy, study[i], '_ante_', j)
            regress(simu, policy, params.policy_type, 250, params.render)  # fit the policy by regression (250 iterations) before training
            simu.train(pw, params, policy, critic, policy_loss_file, critic_loss_file, study[i])
            plot_policy(policy, simu.env, True, simu.env_name, study[i], '_post_', j, plot=False)
            plot_critic(simu, critic, policy, study[i], '_post_', j)
            critic.save_model('data/critics/' + params.env_name + '#' + params.team_name
                              + '#' + study[i] + str(j) + '.pt')
    chrono.stop()

def get_same_starting_policies(params):
    """
    Build one set of initial policy weights per repetition, so that several
    studies can be started from the same initial policies
    :param params: the parameters of the study
    :return: a list of params.nb_repet weight vectors
    """
    assert params.policy_type in ['normal', 'squashedGaussian', 'beta'], 'unsupported policy type'
    simu = make_simu_from_params(params)
    policies = []
    for i in range(params.nb_repet):
        if params.policy_type == 'normal':
            policies.append(NormalPolicy(simu.obs_size, 32, 64, 1, params.lr_actor).get_weights())
        elif params.policy_type == 'squashedGaussian':
            policies.append(SquashedGaussianPolicy(simu.obs_size, 32, 64, 1, params.lr_actor).get_weights())
        elif params.policy_type == 'beta':
            policies.append(BetaPolicy(simu.obs_size, 32, 64, 1, params.lr_actor).get_weights())
    return policies

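# --- Pairing sketch: get_same_starting_policies feeds the starting_pol
# argument of study_cem above, so that repeated studies start from identical
# initial weights. The nb_trajs_cem change below is a hypothetical example of
# what one might vary between the two calls.
def _demo_shared_starting_policies(params):
    starting_pol = get_same_starting_policies(params)
    study_cem(params, starting_pol=starting_pol)   # first configuration
    params.nb_trajs_cem = 20                       # hypothetical variation
    study_cem(params, starting_pol=starting_pol)   # same initial weights, new budget
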
def study_beta(params):
    assert params.policy_type in ['bernoulli', 'normal'], 'unsupported policy type'
    simu = make_simu_from_params(params)
    for beta in [0.1, 0.5, 1.0, 5.0, 10.0]:
        print("beta:", beta)
        policy_loss_file, critic_loss_file = set_files(str(beta), simu.env_name)
        simu.env.set_file_name(str(beta) + '_' + simu.env_name)
        for i in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "bernoulli":
                policy = BernoulliPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 24, 36, 1, params.lr_actor)
            if not simu.discrete:
                act_size = simu.env.action_space.shape[0]
                critic = QNetworkContinuous(simu.obs_size + act_size, 24, 36, 1, params.lr_critic)
            else:
                critic = VNetwork(simu.obs_size, 24, 36, 1, params.lr_critic)
            pw = PolicyWrapper(policy, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            simu.train(pw, params, policy, critic, policy_loss_file,
                       critic_loss_file, "beta", beta)

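# --- Usage sketch (assumption): the beta values are swept internally, so the
# caller only supplies the shared params; lr_actor and lr_critic are read by
# the policy and critic constructors above, the other fields are as in the
# earlier sketches. The learning rates below are hypothetical.
def _demo_study_beta(params):
    params.policy_type = 'normal'
    params.lr_actor = 1e-4
    params.lr_critic = 1e-3
    study_beta(params)  # nb_repet runs for each beta in {0.1, 0.5, 1.0, 5.0, 10.0}
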
def evaluate_policy(params, env, weights): """ Perform an episode using the policy parameter and return the obtained reward Used to evaluate an already trained policy, without storing data for further training :return: the total reward collected during the episode """ if params.multi_threading: ray.init(include_dashboard=False) @ray.remote def eval(params, nb_evals, sim): average_tot_score = 0 for j in range(nb_evals): state = sim.env.reset() total_reward = 0 for t in range(params.max_episode_steps): action = policy.select_action(state, params.deterministic_eval) # print("action", action) if params.policy_type == "normal": next_state, reward, done, _ = sim.env.step(action) elif params.policy_type == "beta": if params.env_name == "Pendulum-v0": next_state, reward, done, _ = sim.env.step( 2 * (2 * action - 1)) elif params.env_name == "CartPoleContinuous-v0": next_state, reward, done, _ = sim.env.step(2 * action - 1) total_reward += reward state = next_state if done: # print(total_reward) average_tot_score += total_reward break env.close() return average_tot_score / nb_evals if params.policy_type == "normal": policy = NormalPolicy(env.observation_space.shape[0], 32, 64, 1, params.lr_actor) if params.policy_type == "beta": policy = BetaPolicy(env.observation_space.shape[0], 32, 64, 1, params.lr_actor) policy.set_weights(weights) workers = min(16, os.cpu_count() + 4) evals = int(params.nb_evals / workers) sim_list = [] for i in range(workers): sim_list.append(Simulator(params)) futures = [eval.remote(params, evals, sim) for sim in sim_list] returns = ray.get(futures) ray.shutdown() average_tot_score = np.sum(returns) / workers return average_tot_score else: if params.policy_type == "normal": policy = NormalPolicy(env.observation_space.shape[0], 32, 64, 1, params.lr_actor) if params.policy_type == "beta": policy = BetaPolicy(env.observation_space.shape[0], 32, 64, 1, params.lr_actor) policy.set_weights(weights) average_tot_score = 0 for j in range(int(args.nb_evals)): state = env.reset() total_reward = 0 for t in range(params.max_episode_steps): action = policy.select_action(state, params.deterministic_eval) next_state, reward, done, _ = env.step(action) # if params.policy_type == "normal": # next_state, reward, done, _ = env.step(action) # elif params.policy_type == "beta": # if params.env_name == "Pendulum-v0": # next_state, reward, done, _ = env.step(2 * (2 * action - 1)) # elif params.env_name == "CartPoleContinuous-v0": # next_state, reward, done, _ = env.step(2 * action - 1) total_reward += reward state = next_state if done: average_tot_score += total_reward / args.nb_evals break return average_tot_score
def study_pg(params) -> None:
    """
    Start a study of the policy gradient algorithms
    :param params: the parameters of the study
    :return: nothing
    """
    #### MODIF : added discrete
    assert params.policy_type in ['bernoulli', 'normal', 'squashedGaussian', 'discrete'], 'unsupported policy type'
    ####
    chrono = Chrono()
    # cuda = torch.device('cuda')
    study = params.gradients
    simu = make_simu_from_params(params)
    for i in range(len(study)):
        simu.env.set_file_name(study[i] + '_' + simu.env_name)
        policy_loss_file, critic_loss_file = set_files(study[i], simu.env_name)
        print("study : ", study[i])
        for j in range(params.nb_repet):
            simu.env.reinit()
            if params.policy_type == "bernoulli":
                policy = BernoulliPolicy(simu.obs_size, 100, 200, 1, params.lr_actor)
            #### MODIF : added the discrete policy
            elif params.policy_type == "discrete":
                if isinstance(simu.env.action_space, gym.spaces.box.Box):
                    nb_actions = int(simu.env.action_space.high[0] - simu.env.action_space.low[0] + 1)
                    print("Error: environment action space is not discrete: " + str(simu.env.action_space))
                else:
                    nb_actions = simu.env.action_space.n
                policy = DiscretePolicy(simu.obs_size, 24, 36, nb_actions, params.lr_actor)
            ####
            elif params.policy_type == "normal":
                policy = NormalPolicy(simu.obs_size, 100, 200, 1, params.lr_actor)
            elif params.policy_type == "squashedGaussian":
                policy = SquashedGaussianPolicy(simu.obs_size, 100, 200, 1, params.lr_actor)
            elif params.policy_type == "DDPG":
                policy = DDPG(simu.obs_size, 24, 36, 1, params.lr_actor)
            # policy = policy.cuda()
            pw = PolicyWrapper(policy, params.policy_type, simu.env_name,
                               params.team_name, params.max_episode_steps)
            plot_policy(policy, simu.env, True, simu.env_name, study[i], '_ante_', j, plot=False)
            if not simu.discrete:
                act_size = simu.env.action_space.shape[0]
                critic = QNetworkContinuous(simu.obs_size + act_size, 24, 36, 1, params.lr_critic)
            else:
                critic = VNetwork(simu.obs_size, 24, 36, 1, params.lr_critic)
            # plot_critic(simu, critic, policy, study[i], '_ante_', j)
            simu.train(pw, params, policy, critic, policy_loss_file, critic_loss_file, study[i])
            plot_policy(policy, simu.env, True, simu.env_name, study[i], '_post_', j, plot=False)
            if False:  # optional weight histograms, disabled by default
                if params.policy_type == "normal":
                    plot_normal_histograms(policy, j, simu.env_name)
                else:
                    plot_weight_histograms(policy, j, simu.env_name)
            plot_critic(simu, critic, policy, study[i], '_post_', j)
            critic.save_model('data/critics/' + params.env_name + '#' + params.team_name
                              + '#' + study[i] + str(j) + '.pt')
    chrono.stop()

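# --- Usage sketch (assumption) exercising the discrete branch of study_pg.
# As before, the Namespace stands in for the repo's argument parser and 'sum'
# is a placeholder gradient name; the valid values depend on simu.train.
def _demo_study_pg_discrete():
    from argparse import Namespace
    params = Namespace(
        env_name='CartPole-v1',   # discrete-action environment
        policy_type='discrete',
        gradients=['sum'],
        nb_repet=3,
        team_name='default_team',
        max_episode_steps=200,
        lr_actor=1e-4,
        lr_critic=1e-3,
    )
    study_pg(params)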