Code example #1
    def train(self,
              pw,
              params,
              policy,
              critic,
              policy_loss_file,
              critic_loss_file,
              study_name,
              beta=0) -> None:
        """
        The main function for training and evaluating a policy
        Repeats training and evaluation params.nb_cycles times
        Stores the value and policy losses at each cycle
        When the reward is greater than the best reward so far, saves the corresponding policy
        :param pw: a policy wrapper, used to save the best policy into a file
        :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
        :param policy: the trained policy
        :param critic: the corresponding critic (not always used)
        :param policy_loss_file: the file to record successive policy loss values
        :param critic_loss_file: the file to record successive critic loss values
        :param study_name: the name of the studied gradient algorithm
        :param beta: a specific parameter for beta-parametrized values
        :return: nothing
        """
        for cycle in range(params.nb_cycles):
            batch = self.make_monte_carlo_batch(params.nb_trajs, params.render,
                                                policy)

            # Update the policy
            batch2 = batch.copy_batch()
            algo = Algo(study_name, params.critic_estim_method, policy, critic,
                        params.gamma, beta, params.nstep)
            algo.prepare_batch(batch)
            policy_loss = batch.train_policy_td(policy)

            # Update the critic
            assert params.critic_update_method in [
                'batch', 'dataset'
            ], 'unsupported critic update method'
            if params.critic_update_method == "dataset":
                critic_loss = algo.train_critic_from_dataset(batch2, params)
            elif params.critic_update_method == "batch":
                critic_loss = algo.train_critic_from_batch(batch2)
            critic_loss_file.write(str(cycle) + " " + str(critic_loss) + "\n")
            policy_loss_file.write(str(cycle) + " " + str(policy_loss) + "\n")

            # policy evaluation part
            total_reward = self.evaluate_episode(policy,
                                                 params.deterministic_eval)
            # plot_trajectory(batch2, self.env, cycle+1)

            # save best reward agent (no need for averaging if the policy is deterministic)
            if self.best_reward < total_reward:
                self.best_reward = total_reward
                pw.save(self.best_reward)
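
Each cycle of the loop above appends one "<cycle> <loss>" pair per line to the loss files. As a side note, a file in that format can be reloaded and plotted with a few lines of NumPy and Matplotlib; the file name below is illustrative.

import numpy as np
import matplotlib.pyplot as plt

# The training loop writes "<cycle> <loss>" pairs, one per line, so the loss
# history can be reloaded directly (the file name is an assumption).
cycles, losses = np.loadtxt("policy_loss.txt", unpack=True)
plt.plot(cycles, losses)
plt.xlabel("cycle")
plt.ylabel("policy loss")
plt.show()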
Code example #2
File: simu.py  Project: KohlerHECTOR/PANDROIDE
    def train(self, pw, params, policy, critic, policy_loss_file, critic_loss_file, study_name, beta=0, is_cem=False):
        """
        Train a policy either with a policy-gradient method (is_cem=False) or with CEM (is_cem=True).
        Records the weights, rewards, populations, population scores and elite indices at each cycle.
        """
        all_weights = np.zeros((int(params.nb_cycles + 1), policy.get_weights_dim(False)))
        all_rewards = np.zeros(params.nb_cycles + 1)
        best_reward = -np.inf
        best_weights = np.zeros(policy.get_weights_dim(False))

        all_pops = np.zeros((params.nb_cycles, params.population, policy.get_weights_dim(False)))
        all_pops_scores = np.zeros((params.nb_cycles, params.population))
        # is_kept = np.zeros((params.nb_cycles, params.population))
        list_elite_index = np.zeros((params.nb_cycles, int(params.elites_frac * params.population)))
        fixed = params.fix_layers
        idx_best = 0
        if not is_cem:
            if fixed:
                print(fixed)
                fc1_w, fc1_b, fc2_w, fc2_b = policy.get_weights_pg()
                # print(fc1_w)
                # print(policy.test())

        if is_cem:
            all_weights = np.zeros((int(params.nb_cycles + 1), policy.get_weights_dim(fixed)))
            best_weights = np.zeros(policy.get_weights_dim(fixed))
            # Random init of the neural network.
            # So far, all the layers are initialized with the same Gaussian.
            init_weights = np.array(params.sigma * np.random.randn(policy.get_weights_dim(False)))
            # print(np.shape(init_weights))
            # start_weights = np.array(3*np.random.randn(policy.get_weights_dim(False)))
            policy.set_weights(init_weights, False)

            print(fixed)
            # print(params.fix_layers)
            # print(policy.get_weights_dim(params.fix_layers))
            study = params.study_name
            noise = np.diag(np.ones(policy.get_weights_dim(fixed)) * params.sigma)
            # print(np.shape(noise))
            # var = np.cov(init_weights[:, -policy.get_weights_dim(fixed):], rowvar=False) + noise
            # mu = init_weights[:, -policy.get_weights_dim(fixed):].mean(axis=0)

            var = np.diag(np.ones(policy.get_weights_dim(fixed)) * np.var(init_weights)) + noise
            print(np.shape(var))
            mu = init_weights[-policy.get_weights_dim(fixed):]
            all_weights[0] = mu
            all_rewards[0] = self.evaluate_episode(policy, params.deterministic_eval)
            print(np.shape(mu))
            rng = np.random.default_rng()

            #we can draw the last layer from a different gaussian
            #mu=params.sigma_bis*np.random.randn(policy.get_weights_dim(params.fix_layers))
        for cycle in range(params.nb_cycles):
            if is_cem:
                rewards = np.zeros(params.population)
                weights = rng.multivariate_normal(mu, var, params.population)
                for p in range(params.population):
                    policy.set_weights(weights[p], fixed)
                    batch = self.make_monte_carlo_batch(params.nb_trajs_cem, params.render, policy, True)
                    rewards[p] = batch.train_policy_cem(policy, params.bests_frac)
                    all_pops[cycle, p] = weights[p]
                    all_pops_scores[cycle, p] = rewards[p]

                elites_nb = int(params.elites_frac * params.population)
                elites_idxs = rewards.argsort()[-elites_nb:]
                list_elite_index[cycle] = elites_idxs
                # for i in elites_idxs:
                #     is_kept[cycle][i] = 1

                elites_weights = [weights[i] for i in elites_idxs]
                # update the mean and covariance from the elites
                mu = np.array(elites_weights).mean(axis=0)
                var = np.cov(elites_weights, rowvar=False) + noise

                # print(best_weights)
                # policy evaluation part
                policy.set_weights(mu, fixed)

                total_reward = self.evaluate_episode(policy, params.deterministic_eval)

                if total_reward > best_reward:
                    best_weights = mu
                    best_reward = total_reward
                    idx_best = cycle
                all_rewards[cycle + 1] = total_reward
                # if total_reward>np.min(top_ten_scores):
                #     temp_min=np.argmin(top_ten_scores)
                #     top_ten_scores[temp_min]=total_reward
                #     top_ten_policies[temp_min]=mu

                # Update the file for the plot
                # reward_file = policy_loss_file
                # reward_file.write(str(cycle) + " " + str(total_reward) + "\n")
                # if (cycle+1)%3==0:
                    # all_weights[int((cycle+1)/3)-1]=mu
                all_weights[cycle + 1] = mu

            else:
                batch = self.make_monte_carlo_batch(params.nb_trajs_pg, params.render, policy)

                # Update the policy
                batch2 = batch.copy_batch()
                algo = Algo(study_name, params.critic_estim_method, policy, critic, params.gamma, beta, params.nstep)
                algo.prepare_batch(batch)
                policy_loss = batch.train_policy_td(policy)
                # if (cycle+1)%3==0:
                #     all_weights[int((cycle+1)/3)-1]=policy.get_weights_as_numpy()
                all_weights[cycle] = policy.get_weights_as_numpy()
                #print(policy_loss)

                # Update the critic
                assert params.critic_update_method in ['batch', 'dataset'], 'unsupported critic update method'
                if params.critic_update_method == "dataset":
                    critic_loss = algo.train_critic_from_dataset(batch2, params)
                elif params.critic_update_method == "batch":
                    critic_loss = algo.train_critic_from_batch(batch2)
                critic_loss_file.write(str(cycle) + " " + str(critic_loss) + "\n")
                policy_loss_file.write(str(cycle) + " " + str(policy_loss) + "\n")
                plot_trajectory(batch2, self.env, cycle+1)

                # policy evaluation part
                if fixed:
                    policy.set_weights_pg(fc1_w, fc1_b, fc2_w, fc2_b)
                total_reward = self.evaluate_episode(policy, params.deterministic_eval)
                all_rewards[cycle] = total_reward
                if total_reward > best_reward:
                    best_weights = policy.get_weights_as_numpy()
                    best_reward = total_reward
                    idx_best = cycle
            print(total_reward)
        # X_embedded = TSNE(n_components=2).fit_transform(all_cem_weights)
        # # print(np.shape(X_embedded))
        # # print(X_embedded)
        # plt.scatter(*zip(*X_embedded))
        # return all_weights,best_weights,all_rewards,idx_best
        return all_weights, all_rewards, all_pops, all_pops_scores, list_elite_index
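
The CEM branch of the method above follows the standard cross-entropy-method update: sample a population of weight vectors from a Gaussian, keep the best-scoring fraction, and refit the mean and covariance (plus exploration noise) on those elites. The self-contained sketch below illustrates that update on a toy quadratic score; the score function and constants are illustrative and not taken from the project.

import numpy as np

# Standalone sketch of the CEM update performed in the loop above
# (the quadratic score function is a stand-in for the Monte-Carlo return).
def cem_step(mu, var, noise, population, elites_frac, score, rng):
    weights = rng.multivariate_normal(mu, var, population)    # sample a population of candidates
    rewards = np.array([score(w) for w in weights])           # evaluate each candidate
    elites_nb = int(elites_frac * population)
    elites = weights[rewards.argsort()[-elites_nb:]]          # keep the best-scoring fraction
    mu = elites.mean(axis=0)                                  # refit the mean on the elites
    var = np.cov(elites, rowvar=False) + noise                # refit the covariance, plus exploration noise
    return mu, var

rng = np.random.default_rng(0)
dim, sigma = 5, 0.5
mu, var = np.zeros(dim), np.eye(dim)
noise = np.eye(dim) * sigma
for _ in range(20):
    mu, var = cem_step(mu, var, noise, population=50, elites_frac=0.2,
                       score=lambda w: -np.sum((w - 1.0) ** 2), rng=rng)
print(mu)  # the mean drifts towards the optimum of the toy score (all ones)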
Code example #3
File: simu.py  Project: KohlerHECTOR/PANDROIDE
    def train_pg(self,
                 pw,
                 params,
                 policy,
                 critic,
                 policy_loss_file,
                 critic_loss_file,
                 study_name,
                 beta=0) -> None:
        """
        The main function for training and evaluating a policy
        Repeats training and evaluation params.nb_cycles times
        Stores the value and policy losses at each cycle
        When the reward is greater than the best reward so far, saves the corresponding policy
        :param pw: a policy wrapper, used to save the best policy into a file
        :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
        :param policy: the trained policy
        :param policy_loss_file: the file to record successive policy loss values
        :return: nothing
        """
        # Initialize variables
        self.list_weights = []
        self.best_weights = np.zeros(policy.get_weights_dim())
        self.list_rewards = np.zeros((int(params.nb_cycles)))
        self.best_reward = -1e38
        self.best_weights_idx = 0
        total_reward = self.best_reward
        self.list_weights.append(policy.get_weights())

        if params.start_from_policy:
            starting_weights = get_starting_weights(pw)
            policy.set_weights(starting_weights)

        print("Shape of weights vector is: ", np.shape(self.best_weights))
        initial_score = self.evaluate_episode(policy,
                                              params.deterministic_eval,
                                              params)
        total_reward = initial_score
        pw.save(cycle=0, score=initial_score)
        self.env.write_reward(cycle=0, reward=initial_score)
        with SlowBar('Performing a repetition of PG',
                     max=params.nb_cycles - 1) as bar:
            for cycle in range(1, params.nb_cycles):
                batch = self.make_monte_carlo_batch(params.nb_trajs,
                                                    params.render, policy)
                if params.reinforce:
                    batch.sum_rewards()
                    policy_loss = batch.train_policy_td(policy)
                    # self.env.write_gradients(gradient_angles,cycle)
                    policy_loss_file.write(
                        str(cycle) + " " + str(policy_loss) + "\n")
                    batch = self.make_monte_carlo_batch(
                        params.nb_trajs, params.render, policy)

                else:
                    # Update the policy
                    batch2 = batch.copy_batch()
                    algo = Algo(params.study_name, params.critic_estim_method,
                                policy, critic, params.gamma, beta,
                                params.nstep)
                    algo.prepare_batch(batch)
                    policy_loss = batch.train_policy_td(policy)

                    # Update the critic
                    assert params.critic_update_method in [
                        'batch', 'dataset'
                    ], 'unsupported critic update method'
                    if params.critic_update_method == "dataset":
                        critic_loss = algo.train_critic_from_dataset(
                            batch2, params)
                    elif params.critic_update_method == "batch":
                        critic_loss = algo.train_critic_from_batch(batch2)
                    critic_loss_file.write(
                        str(cycle) + " " + str(critic_loss) + "\n")
                    policy_loss_file.write(
                        str(cycle) + " " + str(policy_loss) + "\n")
                    plot_trajectory(batch2, self.env, cycle + 1)

                # add the new weights to the list of weights
                self.list_weights.append(policy.get_weights())
                distance = np.linalg.norm(self.list_weights[-1] -
                                          self.list_weights[-2])
                self.env.write_distances(cycle, distance)
                self.write_angles_global(cycle)

                # policy evaluation part
                if (cycle % params.eval_freq) == 0:
                    total_reward = self.evaluate_episode(
                        policy, params.deterministic_eval, params)
                    # write and store the reward
                    self.env.write_reward(cycle, total_reward)
                    self.list_rewards[cycle] = total_reward
                    # plot_trajectory(batch2, self.env, cycle+1)

                # save best reward agent (no need for averaging if the policy is deterministic)
                if self.best_reward < total_reward:
                    self.best_reward = total_reward
                    self.best_weights = self.list_weights[-1]
                    self.best_weights_idx = cycle
                # Save the best policy obtained
                if (cycle % params.save_freq) == 0:
                    pw.save(cycle=cycle, score=total_reward)
                bar.next()

        # pw.rename_best(method="PG",best_cycle=self.best_weights_idx,best_score=self.best_reward)
        print("Best reward: ", self.best_reward)
        print("Best reward iter: ", self.best_weights_idx)