def train(self, pw, params, policy, critic, policy_loss_file, critic_loss_file, study_name, beta=0) -> None:
    """
    The main function for training and evaluating a policy
    Repeats training and evaluation params.nb_cycles times
    Stores the value and policy losses at each cycle
    When the reward is greater than the best reward so far, saves the corresponding policy
    :param pw: a policy wrapper, used to save the best policy into a file
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :param policy: the trained policy
    :param critic: the corresponding critic (not always used)
    :param policy_loss_file: the file to record successive policy loss values
    :param critic_loss_file: the file to record successive critic loss values
    :param study_name: the name of the studied gradient algorithm
    :param beta: a specific parameter for beta-parametrized values
    :return: nothing
    """
    for cycle in range(params.nb_cycles):
        batch = self.make_monte_carlo_batch(params.nb_trajs, params.render, policy)

        # Update the policy
        batch2 = batch.copy_batch()
        algo = Algo(study_name, params.critic_estim_method, policy, critic,
                    params.gamma, beta, params.nstep)
        algo.prepare_batch(batch)
        policy_loss = batch.train_policy_td(policy)

        # Update the critic
        assert params.critic_update_method in ['batch', 'dataset'], 'unsupported critic update method'
        if params.critic_update_method == "dataset":
            critic_loss = algo.train_critic_from_dataset(batch2, params)
        elif params.critic_update_method == "batch":
            critic_loss = algo.train_critic_from_batch(batch2)
        critic_loss_file.write(str(cycle) + " " + str(critic_loss) + "\n")
        policy_loss_file.write(str(cycle) + " " + str(policy_loss) + "\n")

        # Policy evaluation part
        total_reward = self.evaluate_episode(policy, params.deterministic_eval)
        # plot_trajectory(batch2, self.env, cycle+1)

        # Save the best-reward agent (no need for averaging if the policy is deterministic)
        if self.best_reward < total_reward:
            self.best_reward = total_reward
            pw.save(self.best_reward)
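# A minimal, self-contained sketch of the quantities that params.gamma and params.nstep
# parameterize in the Algo call above: the discounted Monte-Carlo return and an n-step
# bootstrapped target. The inputs are hypothetical lists of per-step rewards and critic
# values; the actual estimation lives in Algo.prepare_batch and the critic estimation methods.
def _return_estimation_sketch(rewards, values, gamma=0.99, nstep=3):
    T = len(rewards)
    # Monte-Carlo return: G_t = r_t + gamma * G_{t+1}, computed backwards over the episode
    mc_returns = [0.0] * T
    running = 0.0
    for t in reversed(range(T)):
        running = rewards[t] + gamma * running
        mc_returns[t] = running
    # n-step target: the next n discounted rewards, bootstrapped with V(s_{t+n}) when available
    nstep_targets = [0.0] * T
    for t in range(T):
        target = 0.0
        for k in range(min(nstep, T - t)):
            target += (gamma ** k) * rewards[t + k]
        if t + nstep < T:
            target += (gamma ** nstep) * values[t + nstep]
        nstep_targets[t] = target
    return mc_returns, nstep_targets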
def train(self, pw, params, policy, critic, policy_loss_file, critic_loss_file, study_name, beta=0, is_cem=False):
    """
    Train and evaluate a policy for params.nb_cycles cycles, either with the
    cross-entropy method (is_cem=True) or with a policy-gradient update (is_cem=False)
    Records the successive mean weights, rewards, populations and elite indices
    :param pw: a policy wrapper, used to save the best policy into a file
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :param policy: the trained policy
    :param critic: the corresponding critic (not always used)
    :param policy_loss_file: the file to record successive policy loss values
    :param critic_loss_file: the file to record successive critic loss values
    :param study_name: the name of the studied gradient algorithm
    :param beta: a specific parameter for beta-parametrized values
    :param is_cem: whether to train with the cross-entropy method instead of policy gradient
    :return: all_weights, all_rewards, all_pops, all_pops_scores, list_elite_index
    """
    all_weights = np.zeros((int(params.nb_cycles + 1), policy.get_weights_dim(False)))
    all_rewards = np.zeros(params.nb_cycles + 1)
    best_reward = -np.inf
    best_weights = np.zeros(policy.get_weights_dim(False))
    all_pops = np.zeros((params.nb_cycles, params.population, policy.get_weights_dim(False)))
    all_pops_scores = np.zeros((params.nb_cycles, params.population))
    # is_kept = np.zeros((params.nb_cycles, params.population))
    list_elite_index = np.zeros((params.nb_cycles, int(params.elites_frac * params.population)))
    fixed = params.fix_layers
    idx_best = 0

    if not is_cem:
        if fixed:
            print(fixed)
            fc1_w, fc1_b, fc2_w, fc2_b = policy.get_weights_pg()
            # print(fc1_w)
        # print(policy.test())

    if is_cem:
        all_weights = np.zeros((int(params.nb_cycles + 1), policy.get_weights_dim(fixed)))
        best_weights = np.zeros(policy.get_weights_dim(fixed))
        # Random init of the neural network.
        # So far, all the layers are initialized with the same Gaussian.
        init_weights = np.array(params.sigma * np.random.randn(policy.get_weights_dim(False)))
        # print(np.shape(init_weights))
        # start_weights = np.array(3 * np.random.randn(policy.get_weights_dim(False)))
        policy.set_weights(init_weights, False)
        print(fixed)
        # print(params.fix_layers)
        # print(policy.get_weights_dim(params.fix_layers))
        study = params.study_name
        noise = np.diag(np.ones(policy.get_weights_dim(fixed)) * params.sigma)
        # print(np.shape(noise))
        # var = np.cov(init_weights[:, -policy.get_weights_dim(fixed):], rowvar=False) + noise
        # mu = init_weights[:, -policy.get_weights_dim(fixed):].mean(axis=0)
        var = np.diag(np.ones(policy.get_weights_dim(fixed)) * np.var(init_weights)) + noise
        print(np.shape(var))
        mu = init_weights[-policy.get_weights_dim(fixed):]
        all_weights[0] = mu
        all_rewards[0] = self.evaluate_episode(policy, params.deterministic_eval)
        print(np.shape(mu))
        rng = np.random.default_rng()
        # We could draw the last layer from a different Gaussian:
        # mu = params.sigma_bis * np.random.randn(policy.get_weights_dim(params.fix_layers))

    for cycle in range(params.nb_cycles):
        if is_cem:
            rewards = np.zeros(params.population)
            weights = rng.multivariate_normal(mu, var, params.population)
            for p in range(params.population):
                policy.set_weights(weights[p], fixed)
                batch = self.make_monte_carlo_batch(params.nb_trajs_cem, params.render, policy, True)
                rewards[p] = batch.train_policy_cem(policy, params.bests_frac)
                all_pops[cycle, p] = weights[p]
                all_pops_scores[cycle, p] = rewards[p]
            elites_nb = int(params.elites_frac * params.population)
            elites_idxs = rewards.argsort()[-elites_nb:]
            list_elite_index[cycle] = elites_idxs
            # for i in elites_idxs:
            #     is_kept[cycle][i] = 1
            elites_weights = [weights[i] for i in elites_idxs]

            # Update the sampling distribution from the elite weights
            mu = np.array(elites_weights).mean(axis=0)
            var = np.cov(elites_weights, rowvar=False) + noise
            # print(best_weights)

            # Policy evaluation part
            policy.set_weights(mu, fixed)
            total_reward = self.evaluate_episode(policy, params.deterministic_eval)
            if total_reward > best_reward:
                best_weights = mu
                best_reward = total_reward
                idx_best = cycle
            all_rewards[cycle + 1] = total_reward
            # if total_reward > np.min(top_ten_scores):
            #     temp_min = np.argmin(top_ten_scores)
            #     top_ten_scores[temp_min] = total_reward
            #     top_ten_policies[temp_min] = mu
            # Update the file for the plot
            # reward_file = policy_loss_file
            # reward_file.write(str(cycle) + " " + str(total_reward) + "\n")
            # if (cycle + 1) % 3 == 0:
            #     all_weights[int((cycle + 1) / 3) - 1] = mu
            all_weights[cycle + 1] = mu
        else:
            batch = self.make_monte_carlo_batch(params.nb_trajs_pg, params.render, policy)

            # Update the policy
            batch2 = batch.copy_batch()
            algo = Algo(study_name, params.critic_estim_method, policy, critic,
                        params.gamma, beta, params.nstep)
            algo.prepare_batch(batch)
            policy_loss = batch.train_policy_td(policy)
            # if (cycle + 1) % 3 == 0:
            #     all_weights[int((cycle + 1) / 3) - 1] = policy.get_weights_as_numpy()
            all_weights[cycle] = policy.get_weights_as_numpy()
            # print(policy_loss)

            # Update the critic
            assert params.critic_update_method in ['batch', 'dataset'], 'unsupported critic update method'
            if params.critic_update_method == "dataset":
                critic_loss = algo.train_critic_from_dataset(batch2, params)
            elif params.critic_update_method == "batch":
                critic_loss = algo.train_critic_from_batch(batch2)
            critic_loss_file.write(str(cycle) + " " + str(critic_loss) + "\n")
            policy_loss_file.write(str(cycle) + " " + str(policy_loss) + "\n")
            plot_trajectory(batch2, self.env, cycle + 1)

            # Policy evaluation part
            if fixed:
                policy.set_weights_pg(fc1_w, fc1_b, fc2_w, fc2_b)
            total_reward = self.evaluate_episode(policy, params.deterministic_eval)
            all_rewards[cycle] = total_reward
            if total_reward > best_reward:
                best_weights = policy.get_weights_as_numpy()
                best_reward = total_reward
                idx_best = cycle
            print(total_reward)

    # X_embedded = TSNE(n_components=2).fit_transform(all_cem_weights)
    # print(np.shape(X_embedded))
    # print(X_embedded)
    # plt.scatter(*zip(*X_embedded))
    # return all_weights, best_weights, all_rewards, idx_best
    return all_weights, all_rewards, all_pops, all_pops_scores, list_elite_index
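# A self-contained toy version of the CEM update used in the is_cem branch above: sample
# a population from N(mu, var), score it, keep the elites_frac best, then refit mu to the
# elite mean and var to the elite covariance plus diagonal noise. The quadratic objective
# and the hyper-parameter values are hypothetical; the real code scores each candidate
# with Monte-Carlo rollouts through batch.train_policy_cem.
def _cem_toy_sketch(dim=5, population=50, elites_frac=0.2, sigma=0.1, nb_cycles=30):
    rng = np.random.default_rng(0)
    target = rng.standard_normal(dim)

    def score(w):
        # Toy objective: higher is better, maximized at w == target
        return -np.sum((w - target) ** 2)

    mu = np.zeros(dim)
    noise = np.diag(np.ones(dim) * sigma)
    var = np.diag(np.ones(dim)) + noise
    elites_nb = int(elites_frac * population)
    for _ in range(nb_cycles):
        weights = rng.multivariate_normal(mu, var, population)
        rewards = np.array([score(w) for w in weights])
        elites_idxs = rewards.argsort()[-elites_nb:]
        elites_weights = weights[elites_idxs]
        mu = elites_weights.mean(axis=0)
        var = np.cov(elites_weights, rowvar=False) + noise
    return mu  # ends up close to `target`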
def train_pg(self, pw, params, policy, critic, policy_loss_file, critic_loss_file, study_name, beta=0) -> None:
    """
    The main function for training and evaluating a policy
    Repeats training and evaluation params.nb_cycles times
    Stores the value and policy losses at each cycle
    When the reward is greater than the best reward so far, saves the corresponding policy
    :param pw: a policy wrapper, used to save the best policy into a file
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :param policy: the trained policy
    :param critic: the corresponding critic (not always used)
    :param policy_loss_file: the file to record successive policy loss values
    :param critic_loss_file: the file to record successive critic loss values
    :param study_name: the name of the studied gradient algorithm
    :param beta: a specific parameter for beta-parametrized values
    :return: nothing
    """
    # Initialize variables
    self.list_weights = []
    self.best_weights = np.zeros(policy.get_weights_dim())
    self.list_rewards = np.zeros(int(params.nb_cycles))
    self.best_reward = -1e38
    self.best_weights_idx = 0
    total_reward = self.best_reward
    self.list_weights.append(policy.get_weights())

    if params.start_from_policy:
        starting_weights = get_starting_weights(pw)
        policy.set_weights(starting_weights)

    print("Shape of weights vector is: ", np.shape(self.best_weights))
    initial_score = self.evaluate_episode(policy, params.deterministic_eval, params)
    total_reward = initial_score
    pw.save(cycle=0, score=initial_score)
    self.env.write_reward(cycle=0, reward=initial_score)

    with SlowBar('Performing a repetition of PG', max=params.nb_cycles - 1) as bar:
        for cycle in range(1, params.nb_cycles):
            batch = self.make_monte_carlo_batch(params.nb_trajs, params.render, policy)
            if params.reinforce:
                batch.sum_rewards()
                policy_loss = batch.train_policy_td(policy)
                # self.env.write_gradients(gradient_angles, cycle)
                policy_loss_file.write(str(cycle) + " " + str(policy_loss) + "\n")
                batch = self.make_monte_carlo_batch(params.nb_trajs, params.render, policy)
            else:
                # Update the policy
                batch2 = batch.copy_batch()
                algo = Algo(params.study_name, params.critic_estim_method, policy, critic,
                            params.gamma, beta, params.nstep)
                algo.prepare_batch(batch)
                policy_loss = batch.train_policy_td(policy)

                # Update the critic
                assert params.critic_update_method in ['batch', 'dataset'], 'unsupported critic update method'
                if params.critic_update_method == "dataset":
                    critic_loss = algo.train_critic_from_dataset(batch2, params)
                elif params.critic_update_method == "batch":
                    critic_loss = algo.train_critic_from_batch(batch2)
                critic_loss_file.write(str(cycle) + " " + str(critic_loss) + "\n")
                policy_loss_file.write(str(cycle) + " " + str(policy_loss) + "\n")
                plot_trajectory(batch2, self.env, cycle + 1)

            # Add the new weights to the list of weights
            self.list_weights.append(policy.get_weights())
            distance = np.linalg.norm(self.list_weights[-1] - self.list_weights[-2])
            self.env.write_distances(cycle, distance)
            self.write_angles_global(cycle)

            # Policy evaluation part
            if (cycle % params.eval_freq) == 0:
                total_reward = self.evaluate_episode(policy, params.deterministic_eval, params)
                # Write and store the reward
                self.env.write_reward(cycle, total_reward)
                self.list_rewards[cycle] = total_reward
                # plot_trajectory(batch2, self.env, cycle+1)

            # Save the best-reward agent (no need for averaging if the policy is deterministic)
            if self.best_reward < total_reward:
                self.best_reward = total_reward
                self.best_weights = self.list_weights[-1]
                self.best_weights_idx = cycle

            # Save the best policy obtained
            if (cycle % params.save_freq) == 0:
                pw.save(cycle=cycle, score=total_reward)
            bar.next()

    # pw.rename_best(method="PG", best_cycle=self.best_weights_idx, best_score=self.best_reward)
    print("Best reward: ", self.best_reward)
    print("Best reward iter: ", self.best_weights_idx)