Example No. 1
    def step(self) -> Tuple[List[float], List[float]]:
        """
        Performs one optimization step using antithetic sampling over the population.
        Returns training population rewards and timesteps for the current step.
        """
        perturbs = self._sample_pertrubations(self.num_agents)
        perturbs_rew = []
        perturbs_timesteps = []
        centroid_parameters = unroll_parameters(self.centroid.parameters())

        report_rew = []
        for ind in range(0, self.num_agents):
            perturb = perturbs[ind]

            # Initialize agent with perturbed parameters
            self.agent.policy.init_from_parameters(centroid_parameters +
                                                   perturb)
            reward, timesteps = self.agent.train_rollout(
                self.num_trials, self.env_name, self.seed)

            # Initialize agent with anti perturbed parameters
            self.agent.policy.init_from_parameters(centroid_parameters -
                                                   perturb)
            reward_anti, timesteps_anti = self.agent.train_rollout(
                self.num_trials, self.env_name, self.seed)

            perturbs_rew.append(reward - reward_anti)

            perturbs_timesteps.append(timesteps)
            perturbs_timesteps.append(timesteps_anti)
            report_rew.append(reward)
            report_rew.append(reward_anti)

        # Transform rewards as in Salimans et al. (2017)
        transformed_rews = compute_centered_ranks(np.array(perturbs_rew))

        # GRADIENT ASCENT
        perturbs = np.stack(perturbs)
        total_grad = torch.zeros(self.num_parameters)
        for ind in range(0, self.num_agents):
            grad = torch.tensor(transformed_rews[ind] * perturbs[ind])
            total_grad += grad * (self.lr) / (2 * self.num_agents *
                                              self.weights_std**2)

        self.grads.append(total_grad)
        self.grads.update_orthogonal()
        centroid_parameters += total_grad
        # Update the centroid
        self.centroid.init_from_parameters(centroid_parameters)
        print("Gradient norm: {}".format(torch.norm(total_grad, p=2)))

        return report_rew, perturbs_timesteps
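
All of these examples pass raw episode returns through compute_centered_ranks before forming the gradient; Example No. 1 attributes the transform to Salimans et al. (2017) and Example No. 2 to OpenAI's code. The function itself is not shown on this page, so the following is a minimal sketch of the usual centered-rank transform; treat the exact implementation as an assumption.

    import numpy as np

    def compute_ranks(x):
        """Rank of each entry, in [0, len(x) - 1]; i.e. the argsort of the argsort."""
        ranks = np.empty(len(x), dtype=int)
        ranks[x.argsort()] = np.arange(len(x))
        return ranks

    def compute_centered_ranks(x):
        """Map rewards to ranks rescaled into [-0.5, 0.5], preserving x's shape."""
        y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
        y /= (x.size - 1)
        y -= 0.5
        return y

Because only the ordering of the returns survives, a single outlier rollout cannot dominate the update, which is why Example No. 3 can describe the result as rewards normalized to [-0.5, 0.5].
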
Example No. 2
    def run_es(self):
        """ Runs Evolution Strategies.

        Tricks used:
            - Antithetic (i.e. mirrored) sampling.
            - Rank transformation, using OpenAI's code.

        Tricks avoided:
            - Fixed Gaussian block. I like to just regenerate here.
            - Virtual batch normalization, seems to be only for Atari games.
            - Weight decay. Not sure how to do this.
            - Action discretization. For now, it adds extra complexity.

        Final weights are saved and can be pre-loaded elsewhere.
        """
        args = self.args
        t_start = time.time()

        for i in range(args.es_iters):
            if (i % args.log_every_t_iter == 0):
                print("\n************ Iteration %i ************"%i)
            stats = defaultdict(list)

            # Set stuff up for perturbing weights and determining fitness.
            weights_old = self.sess.run(self.weights_v) # Shape (numw,)
            eps_nw = np.random.randn(args.npop // 2, self.num_ws)
            scores_n2 = []

            for j in range(args.npop // 2):
                # Mirrored sampling, positive case, +eps_j.
                weights_new_pos = weights_old + args.sigma * eps_nw[j]
                self.sess.run(self.set_params_op, 
                              feed_dict={self.new_weights_v: weights_new_pos})
                rews_pos = self._compute_return()

                # Mirrored sampling, negative case, -eps_j.
                weights_new_neg = weights_old - args.sigma * eps_nw[j]
                self.sess.run(self.set_params_op, 
                              feed_dict={self.new_weights_v: weights_new_neg})
                rews_neg = self._compute_return()

                scores_n2.append([rews_pos,rews_neg])

            # Determine the new weights based on OpenAI's rank updating.
            proc_returns_n2 = utils.compute_centered_ranks(np.array(scores_n2))
            F_n = proc_returns_n2[:,0] - proc_returns_n2[:,1]
            grad = np.dot(eps_nw.T, F_n)

            # Apply the gradient update. TODO: Change this to ADAM.
            alpha = (args.lrate_es / (args.sigma*args.npop))
            next_weights = weights_old + alpha * grad
            self.sess.run(self.set_params_op, 
                          feed_dict={self.new_weights_v: next_weights})
            
            # Report relevant logs.
            if (i % args.log_every_t_iter == 0):
                hours = (time.time()-t_start) / (60*60.)

                # Test roll-outs with these new weights.
                returns = []
                for _ in range(args.test_trajs):
                    returns.append(self._compute_return(test=True))

                logz.log_tabular("FinalAvgReturns",  np.mean(returns))
                logz.log_tabular("FinalStdReturns",  np.std(returns))
                logz.log_tabular("FinalMaxReturns",  np.max(returns))
                logz.log_tabular("FinalMinReturns",  np.min(returns))
                logz.log_tabular("ScoresAvg",        np.mean(scores_n2))
                logz.log_tabular("ScoresStd",        np.std(scores_n2))
                logz.log_tabular("ScoresMax",        np.max(scores_n2))
                logz.log_tabular("ScoresMin",        np.min(scores_n2))
                logz.log_tabular("TotalTimeHours",   hours)
                logz.log_tabular("TotalIterations",  i)
                logz.dump_tabular()

            # Save the weights so I can test them later.
            if (i % args.snapshot_every_t_iter == 0):
                itr = str(i).zfill(len(str(abs(args.es_iters))))
                with open(self.log_dir+'/snapshots/weights_'+itr+'.pkl', 'wb') as f:
                    pickle.dump(next_weights, f)

        # Save the *final* weights.
        itr = str(i).zfill(len(str(abs(args.es_iters))))
        with open(self.log_dir+'/snapshots/weights_'+itr+'.pkl', 'wb') as f:
            pickle.dump(next_weights, f)
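
The TODO above asks for the plain stochastic-gradient step to be replaced with Adam, and the docstring leaves weight decay open. A minimal Adam optimizer over the flat weight vector could look like the sketch below; the FlatAdam name and interface are assumptions, not part of the original code.

    import numpy as np

    class FlatAdam:
        """Adam over a flat NumPy parameter vector (hypothetical helper)."""

        def __init__(self, num_params, stepsize, beta1=0.9, beta2=0.999, eps=1e-8):
            self.stepsize, self.beta1, self.beta2, self.eps = stepsize, beta1, beta2, eps
            self.m = np.zeros(num_params, dtype=np.float32)  # first-moment estimate
            self.v = np.zeros(num_params, dtype=np.float32)  # second-moment estimate
            self.t = 0

        def step(self, grad):
            """Return the increment to add to the weights for an ascent step on grad."""
            self.t += 1
            self.m = self.beta1 * self.m + (1.0 - self.beta1) * grad
            self.v = self.beta2 * self.v + (1.0 - self.beta2) * grad * grad
            m_hat = self.m / (1.0 - self.beta1 ** self.t)
            v_hat = self.v / (1.0 - self.beta2 ** self.t)
            return self.stepsize * m_hat / (np.sqrt(v_hat) + self.eps)

Inside run_es this would replace the alpha * grad line with something like next_weights = weights_old + adam.step(grad / (args.sigma * args.npop)). For the weight-decay question, Example No. 6 folds an L2 term into the same step by adding config.l2coeff * theta to the negated gradient it hands to its minimizing optimizer.
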
Example No. 3
    def step(self):
        """Run a step in ES.

        1. Kick off all actors to synchronize weights and sample data.
        2. Update the parameters of the model based on the sampled data.
        3. Update the global observation filter based on the local filters of all
           actors, and synchronize the global filter back to all actors.
        """
        num_episodes, num_timesteps = 0, 0
        results = []

        while num_episodes < self.config['min_episodes_per_batch'] or \
                num_timesteps < self.config['min_steps_per_batch']:
            # Send sample signal to all actors
            for q in self.actors_signal_input_queues:
                q.put({'signal': 'sample'})

            # Collect results from all actors
            for q in self.actors_output_queues:
                result = q.get()
                results.append(result)
                # result['noisy_lengths'] is a list of lists, where the inner lists have length 2.
                num_episodes += sum(
                    len(pair) for pair in result['noisy_lengths'])
                num_timesteps += sum(
                    sum(pair) for pair in result['noisy_lengths'])

        all_noise_indices = []
        all_training_rewards = []
        all_training_lengths = []
        all_eval_rewards = []
        all_eval_lengths = []

        for result in results:
            all_eval_rewards.extend(result['eval_rewards'])
            all_eval_lengths.extend(result['eval_lengths'])

            all_noise_indices.extend(result['noise_indices'])
            all_training_rewards.extend(result['noisy_rewards'])
            all_training_lengths.extend(result['noisy_lengths'])

        assert len(all_eval_rewards) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_rewards) ==
                len(all_training_lengths))

        self.sample_total_episodes += num_episodes
        self.sample_total_steps += num_timesteps

        eval_rewards = np.array(all_eval_rewards)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_rewards = np.array(all_training_rewards)
        noisy_lengths = np.array(all_training_lengths)

        # Normalize rewards to centered ranks in [-0.5, 0.5]
        proc_noisy_rewards = utils.compute_centered_ranks(noisy_rewards)
        noises = [
            self.noise.get(index, self.agent.weights_total_size)
            for index in noise_indices
        ]

        # Update the parameters of the model.
        self.agent.learn(proc_noisy_rewards, noises)
        self.latest_flat_weights = self.agent.get_flat_weights()

        # Update obs filter
        self._update_filter()

        # Store the evaluation rewards
        if len(all_eval_rewards) > 0:
            self.eval_rewards_stat.add(np.mean(eval_rewards))
            self.eval_lengths_stat.add(np.mean(eval_lengths))

        metrics = {
            "episodes_this_iter": noisy_lengths.size,
            "sample_total_episodes": self.sample_total_episodes,
            "sample_total_steps": self.sample_total_steps,
            "evaluate_rewards_mean": self.eval_rewards_stat.mean,
            "evaluate_steps_mean": self.eval_lengths_stat.mean,
            "timesteps_this_iter": noisy_lengths.sum(),
        }

        self.log_metrics(metrics)
        return metrics
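
Note that the perturbation vectors themselves are never sent between processes here: each actor reports only noise_indices, and the learner reconstructs the noise with self.noise.get(index, size) (Example No. 6 does the same with noise.get(idx, policy.num_params)). The backing object is typically a shared noise table along the lines of the sketch below; the default size, seed, and method names are assumptions.

    import numpy as np

    class SharedNoiseTable:
        """One large, read-only block of Gaussian noise shared by all workers.
        A perturbation is identified by the integer offset of its slice."""

        def __init__(self, size=25_000_000, seed=123):
            # ~100 MB of float32 noise, generated once from a fixed seed so every
            # process sees the same table.
            self.noise = np.random.RandomState(seed).randn(size).astype(np.float32)

        def get(self, index, dim):
            # Reconstruct a perturbation of length dim from its start offset.
            return self.noise[index:index + dim]

        def sample_index(self, rng, dim):
            # Workers draw an offset locally and only send this integer back.
            return rng.randint(0, len(self.noise) - dim + 1)

Sharing noise this way keeps per-rollout communication down to a few integers and scalars instead of parameter-sized vectors.
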
Example No. 4
def run():
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    gym.logger.set_level(40)
    env = gym.make(args.env_name)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    state_stat = RunningStat(env.observation_space.shape, eps=1e-2)
    action_space = env.action_space
    policy = Policy(state_size, action_size, args.hidden_size,
                    action_space.low, action_space.high)
    num_params = policy.num_params
    optim = Adam(num_params, args.lr)

    ray.init(num_cpus=args.num_parallel)

    return_list = []
    for epoch in range(100000):
        #####################################
        ### Rollout and Update State Stat ###
        #####################################

        policy.set_state_stat(state_stat.mean, state_stat.std)

        # set diff params (mirror sampling)
        assert args.episodes_per_batch % 2 == 0
        diff_params = torch.empty((args.episodes_per_batch, num_params),
                                  dtype=torch.float)
        diff_params_pos = torch.randn(args.episodes_per_batch // 2,
                                      num_params) * args.noise_std
        diff_params[::2] = diff_params_pos
        diff_params[1::2] = -diff_params_pos

        rets = []
        num_episodes_popped = 0
        num_timesteps_popped = 0
        while num_episodes_popped < args.episodes_per_batch \
                and num_timesteps_popped < args.timesteps_per_batch:
            #or num_timesteps_popped < args.timesteps_per_batch:
            results = []
            for i in range(min(args.episodes_per_batch, 500)):
                # set policy
                randomized_policy = deepcopy(policy)
                randomized_policy.add_params(diff_params[num_episodes_popped +
                                                         i])
                # rollout
                results.append(
                    rollout.remote(randomized_policy,
                                   args.env_name,
                                   seed=np.random.randint(0, 10000000)))

            for result in results:
                ret, timesteps, states = ray.get(result)
                rets.append(ret)
                # update state stat
                if states is not None:
                    state_stat.increment(states.sum(axis=0),
                                         np.square(states).sum(axis=0),
                                         states.shape[0])

                num_timesteps_popped += timesteps
                num_episodes_popped += 1
        rets = np.array(rets, dtype=np.float32)
        diff_params = diff_params[:num_episodes_popped]

        best_policy_idx = np.argmax(rets)
        best_policy = deepcopy(policy)
        best_policy.add_params(diff_params[best_policy_idx])
        best_rets = [
            rollout.remote(best_policy,
                           args.env_name,
                           seed=np.random.randint(0, 10000000),
                           calc_state_stat_prob=0.0,
                           test=True) for _ in range(10)
        ]
        best_rets = np.average(ray.get(best_rets))

        print('epoch:', epoch, 'mean:', np.average(rets), 'max:', np.max(rets),
              'best:', best_rets)
        with open(args.outdir + '/return.csv', 'w') as f:
            return_list.append(
                [epoch, np.max(rets),
                 np.average(rets), best_rets])
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(return_list)

            plt.figure()
            sns.lineplot(data=np.array(return_list)[:, 1:])
            plt.savefig(args.outdir + '/return.png')
            plt.close('all')

        #############
        ### Train ###
        #############

        fitness = compute_centered_ranks(rets).reshape(-1, 1)
        if args.weight_decay > 0:
            #l2_decay = args.weight_decay * ((policy.get_params() + diff_params)**2).mean(dim=1, keepdim=True).numpy()
            # Note: despite the name, this penalizes the signed mean of the
            # perturbed parameters rather than their absolute values (a true L1 norm).
            l1_decay = args.weight_decay * (policy.get_params() +
                                            diff_params).mean(
                                                dim=1, keepdim=True).numpy()
            fitness += l1_decay
        grad = (fitness * diff_params.numpy()).mean(axis=0)
        policy = optim.update(policy, -grad)
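
Because the rows of diff_params are interleaved mirrored pairs (+eps in the even rows, -eps in the odd rows), any constant offset in the fitness cancels out of the (fitness * diff_params).mean(axis=0) estimator. The toy check below illustrates that property with made-up sizes; it is a sanity sketch, not part of the original script.

    import torch

    pop, dim = 6, 4  # toy population size and parameter count
    half = torch.randn(pop // 2, dim, dtype=torch.float64)
    diff = torch.empty(pop, dim, dtype=torch.float64)
    diff[::2] = half    # even rows: +eps
    diff[1::2] = -half  # odd rows:  -eps

    # Each mirrored pair sums to zero ...
    assert torch.allclose(diff[::2] + diff[1::2],
                          torch.zeros(pop // 2, dim, dtype=torch.float64))

    # ... so shifting every fitness value by a constant leaves the estimate unchanged.
    fitness = torch.rand(pop, 1, dtype=torch.float64)
    grad_est = (fitness * diff).mean(dim=0)
    grad_shifted = ((fitness + 3.0) * diff).mean(dim=0)
    assert torch.allclose(grad_est, grad_shifted)
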
Example No. 5
def run():
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    gym.logger.set_level(40)
    env = gym.make(args.env_name)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    state_stat = RunningStat(
        env.observation_space.shape,
        eps=1e-2
    )
    action_space = env.action_space
    policy = Policy(state_size, action_size, args.hidden_size, action_space.low, action_space.high)
    num_params = policy.num_params
    es = cma.CMAEvolutionStrategy([0] * num_params,
                                  args.sigma_init,
                                  {'popsize': args.popsize})
    
    ray.init(num_cpus=args.num_parallel)

    return_list = []
    for epoch in range(100000):
        #####################################
        ### Rollout and Update State Stat ###
        #####################################

        solutions = np.array(es.ask(), dtype=np.float32)
        policy.set_state_stat(state_stat.mean, state_stat.std)

        rets = []
        results = []
        for i in range(args.popsize):
            # set policy
            randomized_policy = deepcopy(policy)
            randomized_policy.set_params(solutions[i])
            # rollout
            results.append(rollout.remote(randomized_policy, args.env_name, seed=np.random.randint(0,10000000)))
        
        for result in results:
            ret, timesteps, states = ray.get(result)
            rets.append(ret)
            # update state stat
            if states is not None:
                state_stat.increment(states.sum(axis=0), np.square(states).sum(axis=0), states.shape[0])
            
        rets = np.array(rets, dtype=np.float32)
        
        best_policy_idx = np.argmax(rets)
        best_policy = deepcopy(policy)
        best_policy.set_params(solutions[best_policy_idx])
        best_rets = [rollout.remote(best_policy, args.env_name, seed=np.random.randint(0,10000000), calc_state_stat_prob=0.0, test=True) for _ in range(10)]
        best_rets = np.average(ray.get(best_rets))
        
        print('epoch:', epoch, 'mean:', np.average(rets), 'max:', np.max(rets), 'best:', best_rets)
        with open(args.outdir + '/return.csv', 'w') as f:
            return_list.append([epoch, np.max(rets), np.average(rets), best_rets])
            writer = csv.writer(f, lineterminator='\n')
            writer.writerows(return_list)

            plt.figure()
            sns.lineplot(data=np.array(return_list)[:,1:])
            plt.savefig(args.outdir + '/return.png')
            plt.close('all')
        

        #############
        ### Train ###
        #############

        ranks = compute_centered_ranks(rets)
        fitness = ranks
        if args.weight_decay > 0:
            l2_decay = compute_weight_decay(args.weight_decay, solutions)
            fitness -= l2_decay
        # cma minimizes its objective, so negate the fitness to maximize reward
        es.tell(solutions, -fitness)
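
compute_weight_decay is not shown in this example. Given that its result is subtracted from the fitness, one plausible definition is a per-candidate L2 penalty as sketched below; the exact form (a mean rather than a sum of squares) is an assumption.

    import numpy as np

    def compute_weight_decay(weight_decay, solutions):
        # Hypothetical helper: average squared parameter value per candidate,
        # scaled by the decay coefficient. Subtracting it from the fitness
        # nudges CMA-ES toward smaller weights.
        solutions = np.asarray(solutions, dtype=np.float32)
        return weight_decay * np.mean(solutions * solutions, axis=1)
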
Example No. 6
            # Update ob stats.
            if policy.needs_ob_stat and result.ob_count > 0:
                ob_stat.increment(result.ob_sum, result.ob_sumsq,
                                  result.ob_count)
                ob_count_this_batch += result.ob_count

        # Assemble the results.
        noise_inds_n = np.concatenate(
            [r.noise_inds_n for r in curr_task_results])
        returns_n2 = np.concatenate([r.returns_n2 for r in curr_task_results])
        lengths_n2 = np.concatenate([r.lengths_n2 for r in curr_task_results])
        assert (noise_inds_n.shape[0] == returns_n2.shape[0] ==
                lengths_n2.shape[0])
        # Process the returns.
        if config.return_proc_mode == "centered_rank":
            proc_returns_n2 = utils.compute_centered_ranks(returns_n2)
        else:
            raise NotImplementedError(config.return_proc_mode)

        # Compute and take a step.
        g, count = utils.batched_weighted_sum(
            proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
            (noise.get(idx, policy.num_params) for idx in noise_inds_n),
            batch_size=500)
        g /= returns_n2.size
        assert (g.shape == (policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_inds_n))
        update_ratio = optimizer.update(-g + config.l2coeff * theta)

        # Update ob stat (we're never running the policy in the master, but we
        # might be snapshotting the policy).
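
The fragment above breaks off here. For context, utils.batched_weighted_sum aggregates sum_i weight_i * noise_i without ever stacking all noise slices into one huge matrix; the sketch below assumes the function body, taking only the (g, count) return contract from the asserts above.

    import itertools
    import numpy as np

    def batched_weighted_sum(weights, vecs, batch_size=500):
        """Accumulate sum_i weights[i] * vecs[i] in fixed-size batches."""
        total, count = 0.0, 0
        pairs = iter(zip(weights, vecs))
        while True:
            batch = list(itertools.islice(pairs, batch_size))
            if not batch:
                break
            batch_w = np.asarray([w for w, _ in batch], dtype=np.float32)
            batch_v = np.asarray([v for _, v in batch], dtype=np.float32)
            total = total + batch_w @ batch_v  # (b,) @ (b, num_params) -> (num_params,)
            count += len(batch)
        return total, count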