class CERL_Agent:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (int): Parameter class with all the parameters

	"""
    def __init__(self,
                 args):  # need to initialize rollout_workers to have blue agent
        """Build the population, learner portfolio, buffers and worker processes.

        Parameters:
            args: Parameter object. This constructor reads pop_size,
                rollout_size, state_dim, action_dim, buffer_gpu, batch_size,
                noise_std, hidden_size (SAC only) and blue_trainer
                ('dis' / 'TD3_tennis' only).

        Raises:
            ValueError: if ALGO is not 'SAC', 'TD3', 'dis' or 'TD3_tennis'.
            AssertionError: if ALGO is 'dis' or 'TD3_tennis' and args has no
                blue_trainer attribute.
        """
        self.args = args
        self.evolver = SSNE(
            self.args)  # this evolver implements neuro-evolution

        # MP TOOLS
        self.manager = Manager()

        # Mutation operators, indexed 0 = add, 1 = delete, 2 = exchange
        self.mutate_algos = [
            Mutation_Add(self),
            Mutation_Delete(self),
            Mutation_Exchange(self)
        ]
        # Genealogy tool
        self.genealogy = Genealogy()

        # Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        # Simulated-annealing state: metric history, previous portfolio and
        # the temperature schedule (T starts at T_max and is floored at T_min)
        self.metrics = []
        self.last_portfolio = None
        self.T_max = 30
        self.T = self.T_max
        self.T_min = 0.2
        self.decay_rate = 0.975

        # Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            elif ALGO in ('TD3', 'dis', 'TD3_tennis'):
                # ALGO is forwarded so Actor can distinguish net architectures
                self.pop.append(
                    Actor(args.state_dim, args.action_dim, wwid, ALGO))
            else:
                # The previous test `ALGO == 'dis' or 'TD3_tennis'` was always
                # truthy, which made this branch unreachable.
                raise ValueError("invalid algorithm type")

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1, ALGO)
            if ALGO == 'dis':
                self.average_policy = AverageActor(args.state_dim,
                                                   args.action_dim,
                                                   -2,
                                                   ALGO,
                                                   self.pop,
                                                   self.replay_buffer,
                                                   args.buffer_gpu,
                                                   args.batch_size,
                                                   iterations=10)
                self.average_policy.share_memory()

        # Share the best policy across processes
        self.best_policy.share_memory()

        # Only the blue-agent algorithms need a blue trainer; the trainers
        # list is handed to every rollout worker below.
        if ALGO in ('dis', 'TD3_tennis'):
            assert hasattr(
                args, "blue_trainer"
            ), "must have blue_agent trainer to intialize rollout_worker, see line 109, class Parameter definition"
        if ALGO == 'dis':
            trainers = [args.blue_trainer, self.average_policy]
        elif ALGO == 'TD3_tennis':
            trainers = [args.blue_trainer, None]
        else:
            trainers = []

        self.trainers = trainers

        # None when the algorithm runs without a blue trainer
        self.blue_dqn = getattr(args, "blue_trainer", None)

        # Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()
        # Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        # Complement of the portfolio: whatever is not in the portfolio
        self.complement_portfolio = []
        # total_rollout_bucket holds the whole set of rollout models;
        # rollout_bucket starts out as the same object and is resized to the
        # portfolio by update_rollout_bucket() during simulated annealing
        self.total_rollout_bucket = self.manager.list()
        self.rollout_bucket = self.total_rollout_bucket
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1, ALGO))
        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################
        # Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(worker_id, 0, self.evo_task_pipes[worker_id][1],
                          self.evo_result_pipes[worker_id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO,
                          self.trainers))
            for worker_id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True] * args.pop_size

        # Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(worker_id, 1, self.task_pipes[worker_id][1],
                          self.result_pipes[worker_id][0], True,
                          self.data_bucket, self.rollout_bucket, ENV_NAME,
                          args.noise_std, ALGO, self.trainers))
            for worker_id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True] * args.rollout_size

        # Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(
                Actor(args.state_dim, args.action_dim, -1, ALGO))

        # TEST_SIZE test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(worker_id, 2, self.test_task_pipes[worker_id][1],
                          self.test_result_pipes[worker_id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO,
                          self.trainers)) for worker_id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        # Meta-learning controller (Resource Distribution): allocation[i] is
        # the learner index served by rollout worker i. Start uniformly.
        self.allocation = [
            i % len(self.portfolio) for i in range(args.rollout_size)
        ]

        # Trackers
        self.best_score = -np.inf
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    def _update_SA_temperature(self):
        self.T = max(self.T * self.decay_rate, self.T_min)

    def _get_accept_rate(self):
        if RANDOM_WALK:
            return 1.0
        else:
            if self.metrics[-1] > self.metrics[-2]:
                return 1.0
            else:
                return np.exp((self.metrics[-1] - self.metrics[-2]) / self.T)

    def _mutate(self):
        while True:
            mutate_algo_index = random.choice(range(3))
            if self._try_mutate(mutate_algo_index):
                return

    def _try_mutate(self,
                    algo_index):  # 0 for add, 1 for delete, 2 for exchange
        return self.mutate_algos[algo_index].try_mutate()

    def simulated_annealing(self, metric):  #take in the current metric
        self.metrics.append(metric)
        if self.last_portfolio:  #has last_portfolio
            accept_rate = self._get_accept_rate()  #based on self.metrics[-2:]
            self._update_SA_temperature()
            if np.random.random() > accept_rate:  #reject
                self.portfolio = self.last_portfolio
                self.complement_portfolio = self.last_complement_portfolio

        self.last_portfolio = copy.copy(
            self.portfolio)  #maintain a shallow copy as
        self.last_complement_portfolio = copy.copy(self.complement_portfolio)
        self._mutate()  #perturb the portfolio
        # update rollout_bucket size, only the first len(self.portfolio) rollout_buckets are visible
        self.update_rollout_bucket()
        # update allocation, to be compatible with the current portfolio
        self.update_allocation()

    def update_rollout_bucket(self):
        self.rollout_bucket = self.total_rollout_bucket[:len(self.portfolio)]

    def train_blue_dqn(
        self,
        trainers,
        env_name,
        gen,
        ALGO='dis',
        pomdp_adv=False
    ):  # rollout and training are done together; the opponent is sampled from the population
        """Train the blue agent for NUM_EPISODE epsilon-greedy episodes.

        Parameters:
            trainers: sequence whose first element is the blue agent (its
                act/step/save_net methods are called here); the whole
                sequence is passed to make_self_play_env when ALGO == 'dis'
            env_name: environment name given to EnvironmentWrapper
            gen (int): current generation; sets the starting epsilon and
                decides whether a checkpoint is written
            ALGO (str): 'dis', 'TD3_tennis', or anything else for a plain
                EnvironmentWrapper(env_name, ALGO)
            pomdp_adv (bool): when False a random member of self.pop is
                installed as the red actor each episode; when True
                env.try_set_pomdp_adv() is called instead

        Returns:
            tuple: (average_reward, average_red_reward,
            average_actual_blue_reward). The first is averaged over all
            episodes; the other two only over episodes whose getter returned
            a non-None value.
        """
        NUM_EPISODE = 100  #train 100 episodes for the blue to converge to the new best response to red
        # Starting epsilon: 1.0 before generation 10, then halved per
        # generation with a floor of 0.15
        EPS_START = max(1.0 * 0.5**(gen - 10),
                        0.15) if gen >= 10 else 1.0  #initial epsilon
        EPS_END = 0.05
        EPS_DECAY = 0.995

        if ALGO == 'dis':  # make env with blue and red policy agent inside,
            assert trainers is not None
            dis_env = make_self_play_env(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                return_policy_agent=False,
                trainers=trainers
            )[0]  # trainer if not None, first is the shared DQN agent, second is the best red policy
            env = EnvironmentWrapper(
                env_name, ALGO, dis_env,
                0)  # the "0" is the index for training blue agent
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=no_graphics,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', ALGO, tennis_env, 0)
        else:
            env = EnvironmentWrapper(env_name, ALGO)

        blue_dqn = trainers[0]
        average_reward = 0
        eps = EPS_START

        # Running sums and counts for the optional red / actual-blue rewards
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0

        for it in range(NUM_EPISODE):
            if not pomdp_adv:  #if pomdp_adv, make sure that TD3_actor is never used
                # sample a red opponent uniformly from the population
                id = np.random.choice(np.array(range(len(self.pop))))
                red_actor = self.pop[id]
                env.set_TD3_actor(red_actor)

            fitness = 0.0
            # here fitness is simply the accumulated reward
            total_frame = 0
            state = env.reset()
            env.randomize_neu_adv()

            if pomdp_adv:
                env.try_set_pomdp_adv(
                )  #try to set if opponent to pomdp adv if opponent is adversary, else do nothing

            # roughly 5% of episodes are candidates for rendering
            render_flag = (np.random.random() < 0.05)
            while True:  # unless done

                action = blue_dqn.act(state, eps=eps)
                # action = utils.to_numpy(action)

                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=DRQN
                )  #after calling env.step, evaluator initialized later does not work
                #should be something wrong with the internal red model?
                blue_dqn.step(state, action, reward, next_state, done)

                if render_flag and self.args.render:
                    env.render()
                # next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0)
                state = next_state
                fitness += reward
                total_frame += 1

                # DONE FLAG IS Received
                if done:
                    # a None reward counts as 0 and is left out of the count
                    average_red_reward += env.get_red_reward(
                    ) if env.get_red_reward() is not None else 0
                    average_actual_blue_reward += env.get_blue_actual_reward(
                    ) if env.get_blue_actual_reward() is not None else 0
                    red_count += 1 if env.get_red_reward() is not None else 0
                    blue_count += 1 if env.get_blue_actual_reward(
                    ) is not None else 0
                    # NOTE(review): close() depends only on render_flag, not on
                    # self.args.render, and env is reused next episode - confirm
                    if render_flag: env.env.close()
                    break

            average_reward += fitness
            eps = max(EPS_END, EPS_DECAY * eps)

        # checkpoint every 5th generation from generation 10 onwards
        if gen >= 10 and gen % 5 == 0:
            blue_dqn.save_net('./pytorch_models/train_blue_dqn_step_' +
                              str(gen) + '.pth')

        average_reward /= NUM_EPISODE
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward

    def evaluate_training_fixed_blue(
            self):  #this evaluate against the training opponent (red pop)
        self.evaluator.pomdp_adv = False
        return self.evaluator.evaluate_fixed_agents(self.trainers[0],
                                                    self.trainers[1], self.pop)

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training
            frame_tracker: object whose update() is called with the mean test
                score and total frame count on generations that run a test

        Returns:
            tuple: (max_fit, champ_len, all_fitness, all_eplens, test_mean,
            test_std, champ_wwid); test_mean and test_std are None on
            generations without a test round.
        """
        ################ START ROLLOUTS ##############

        # Start Evolution rollouts
        if not ISOLATE_PG:
            for id, actor in enumerate(self.pop):
                # only workers that have reported back get a new task
                if self.evo_flag[id]:
                    self.evo_task_pipes[id][0].send((id, gen))
                    self.evo_flag[id] = False

        # Sync all learners actor to cpu (rollout) actor
        # (update rollout parameter using the learner parameter, such that rollout worker is up to date)
        for i, learner in enumerate(self.portfolio):  #number of learner
            learner.algo.actor.cpu()
            utils.hard_update(
                self.rollout_bucket[i], learner.algo.actor
            )  #rollout bucket is now synchronized with learner to perform rollout for learner actors
            if torch.cuda.is_available(): learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(
                self.allocation):  #number of rollout_size
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(
                    (learner_id, gen)
                )  #allocation record the id of the learner that bucket should run, so rollout_id is the id of rollout_bucket
                self.roll_flag[rollout_id] = False

        # Start Test rollouts (every 5th generation)
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send((0, gen))

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        # main training loop
        if self.replay_buffer.__len__(
        ) > self.args.batch_size * 10:  ###BURN IN PERIOD
            self.replay_buffer.tensorify(
            )  # Tensorify the buffer for fast sampling

            # Spin up threads for each learner; the last argument is
            # int(gen_frames * gradperstep)
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]  #macheng: do we want to train all the learners?

            # Start threads
            for thread in threads:
                thread.start()

            # Join threads
            for thread in threads:
                thread.join()

            # Now update average_policy
            #self.average_policy.cuda()
            if ALGO == 'dis':
                self.average_policy.update(
                )  #update the average_policy parameter with supervised learning

            self.gen_frames = 0

            #########Visualize Learner Critic Function#################
            # if self.replay_buffer.__len__() % 2500 == 0:
            #	visualize_critic(self.portfolio[2], make_self_play_env(trainers=[[],[]])[0], 50)  #arguments: Learner, env, N_GRID

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            # Polls the result pipes in a loop until the collected fraction
            # of the population reaches asynch_frac
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join: stop once enough of the population has reported
                if len(all_fitness
                       ) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            # blocking recv: every learner rollout must finish
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score: self.best_score = fitness

            self.roll_flag[i] = True

        # Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # The champion of this generation is copied into the test bucket;
        # best_policy is only overwritten when it beats best_score
        if not ISOLATE_PG:
            champ_index = all_net_ids[all_fitness.index(max(all_fitness))]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if max(all_fitness) > self.best_score:
                self.best_score = max(all_fitness)
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % max(all_fitness))

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        # NeuroEvolution's probabilistic selection and recombination step
        # ms: this epoch() method implements neuro-evolution
        if not ISOLATE_PG:  #seems pop_size and rollout_size must be 10, otherwise this will produce error
            # every 5th generation the rollout bucket is also passed to epoch()
            if gen % 5 == 0:
                self.evolver.epoch(
                    gen, self.genealogy, self.pop, all_net_ids, all_fitness,
                    self.rollout_bucket
                )  #this method also copies learner to evolver
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        # META LEARNING - RESET ALLOCATION USING UCB
        # (gen % 1 is 0 for every integer gen, so this runs each generation)
        if gen % 1 == 0:
            self.update_allocation()
        # Metrics
        if not ISOLATE_PG:
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            # fitness / num_frames are from the last learner rollout joined above
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid

    def update_allocation(self):
        """Recompute the rollout-to-learner allocation with ucb().

        The current allocation length is passed along with the portfolio and
        args.ucb_coefficient.
        """
        slot_count = len(self.allocation)
        learners = self.portfolio
        coefficient = self.args.ucb_coefficient
        self.allocation = ucb(slot_count, learners, coefficient)

    def sim_and_eval_POMDP(self):
        """Alternate blue-DQN training and evaluation against the POMDP adversary.

        Creates self.evaluator, then for each of 1,000,000 generations trains
        the blue agent with train_blue_dqn(..., pomdp_adv=True), runs
        self.evaluator.evaluate() and prints both sets of scores.
        """
        # evaluator must be created before train_blue_dqn is called
        self.evaluator = Evaluator(self, 5, self.trainers, pomdp_adv=True)
        for gen in range(1000000):
            print('gen=', gen)
            # Use this instance: the previous code went through a global
            # named `agent`, which is not defined in this scope.
            blue_score, red_score, actual_blue_score = self.train_blue_dqn(
                self.trainers, ENV_NAME, gen, ALGO='dis', pomdp_adv=True)
            print('Env', ENV_NAME, 'Gen', gen,
                  ", Training average: Blue agent score: ", blue_score,
                  " Red score: ", red_score, " Actual blue score: ",
                  actual_blue_score)
            blue_score, red_score, actual_blue_score = self.evaluator.evaluate(
            )
            print("Evaluation result: Blue agent score: ", blue_score,
                  " Red score: ", red_score, " Actual blue score: ",
                  actual_blue_score)
# Example #2
# 0
class CERL_Agent:
    """Main CERL class containing all methods for CERL

		Parameters:
		args (int): Parameter class with all the parameters

	"""
    def __init__(self, args):
        """Build the population, learner portfolio, buffer and worker processes.

        Parameters:
            args: Parameter object. This constructor reads pop_size,
                rollout_size, state_dim, action_dim, buffer_gpu, noise_std
                and, for SAC, hidden_size.
        """
        self.args = args
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Genealogy tool
        self.genealogy = Genealogy()

        #Initialize population
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            wwid = self.genealogy.new_id('evo')
            if ALGO == 'SAC':
                self.pop.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, wwid))
            else:
                self.pop.append(Actor(args.state_dim, args.action_dim, wwid))

        if ALGO == "SAC":
            self.best_policy = GaussianPolicy(args.state_dim, args.action_dim,
                                              args.hidden_size, -1)
        else:
            self.best_policy = Actor(args.state_dim, args.action_dim, -1)

        #Turn off gradients and put in eval mode
        for actor in self.pop:
            actor = actor.cpu()
            actor.eval()

        #Init BUFFER
        self.replay_buffer = Buffer(1000000, self.args.buffer_gpu)

        #Initialize portfolio of learners
        self.portfolio = []
        self.portfolio = initialize_portfolio(self.portfolio, self.args,
                                              self.genealogy, PORTFOLIO_ID)
        # One rollout model per learner in the portfolio
        self.rollout_bucket = self.manager.list()
        for _ in range(len(self.portfolio)):
            if ALGO == 'SAC':
                self.rollout_bucket.append(
                    GaussianPolicy(args.state_dim, args.action_dim,
                                   args.hidden_size, -1))
            else:
                self.rollout_bucket.append(
                    Actor(args.state_dim, args.action_dim, -1))

        # Initialize shared data bucket
        self.data_bucket = self.replay_buffer.tuples

        ############## MULTIPROCESSING TOOLS ###################

        #Evolutionary population Rollout workers
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(worker_id, self.evo_task_pipes[worker_id][1],
                          self.evo_result_pipes[worker_id][0], False,
                          self.data_bucket, self.pop, ENV_NAME, None, ALGO))
            for worker_id in range(args.pop_size)
        ]
        for worker in self.evo_workers:
            worker.start()
        self.evo_flag = [True] * args.pop_size

        #Learner rollout workers
        self.task_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.result_pipes = [Pipe() for _ in range(args.rollout_size)]
        self.workers = [
            Process(target=rollout_worker,
                    args=(worker_id, self.task_pipes[worker_id][1],
                          self.result_pipes[worker_id][0], True,
                          self.data_bucket, self.rollout_bucket, ENV_NAME,
                          args.noise_std, ALGO))
            for worker_id in range(args.rollout_size)
        ]
        for worker in self.workers:
            worker.start()
        self.roll_flag = [True] * args.rollout_size

        #Test bucket
        self.test_bucket = self.manager.list()
        if ALGO == 'SAC':
            self.test_bucket.append(
                GaussianPolicy(args.state_dim, args.action_dim,
                               args.hidden_size, -1))
        else:
            self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1))

        #TEST_SIZE test workers
        self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)]
        self.test_workers = [
            Process(target=rollout_worker,
                    args=(worker_id, self.test_task_pipes[worker_id][1],
                          self.test_result_pipes[worker_id][0], False, None,
                          self.test_bucket, ENV_NAME, args.noise_std, ALGO))
            for worker_id in range(TEST_SIZE)
        ]
        for worker in self.test_workers:
            worker.start()
        self.test_flag = False

        #Meta-learning controller (Resource Distribution): allocation[i] is
        #the learner index served by rollout worker i. Start uniformly.
        self.allocation = [
            i % len(self.portfolio) for i in range(args.rollout_size)
        ]

        #Trackers
        # Start below every possible score: with 0.0, a task whose returns
        # are all negative would never record or save a best policy.
        self.best_score = float("-inf")
        self.gen_frames = 0
        self.total_frames = 0
        self.best_shaped_score = None
        self.test_score = None
        self.test_std = None

    def train(self, gen, frame_tracker):
        """Main training loop to do rollouts, neuroevolution, and policy gradients

        Parameters:
            gen (int): Current epoch of training
            frame_tracker: object whose update() is called with the mean test
                score and total frame count on generations that run a test

        Returns:
            tuple: (max_fit, champ_len, all_fitness, all_eplens, test_mean,
            test_std, champ_wwid); test_mean and test_std are None on
            generations without a test round.
        """
        ################ START ROLLOUTS ##############

        #Start Evolution rollouts
        if not ISOLATE_PG:
            for net_id, _ in enumerate(self.pop):
                # only workers that have reported back get a new task
                if self.evo_flag[net_id]:
                    self.evo_task_pipes[net_id][0].send(net_id)
                    self.evo_flag[net_id] = False

        #Sync all learners actor to cpu (rollout) actor
        for i, learner in enumerate(self.portfolio):
            learner.algo.actor.cpu()
            utils.hard_update(self.rollout_bucket[i], learner.algo.actor)
            # Move back to the GPU only when one exists; an unconditional
            # .cuda() raises on CPU-only hosts.
            if torch.cuda.is_available():
                learner.algo.actor.cuda()

        # Start Learner rollouts
        for rollout_id, learner_id in enumerate(self.allocation):
            if self.roll_flag[rollout_id]:
                self.task_pipes[rollout_id][0].send(learner_id)
                self.roll_flag[rollout_id] = False

        #Start Test rollouts (every 5th generation)
        if gen % 5 == 0:
            self.test_flag = True
            for pipe in self.test_task_pipes:
                pipe[0].send(0)

        ############# UPDATE PARAMS USING GRADIENT DESCENT ##########
        if len(self.replay_buffer) > self.args.batch_size * 10:  ###BURN IN PERIOD
            # Tensorify the buffer for fast sampling
            self.replay_buffer.tensorify()

            #Spin up threads for each learner
            threads = [
                threading.Thread(
                    target=learner.update_parameters,
                    args=(self.replay_buffer, self.args.buffer_gpu,
                          self.args.batch_size,
                          int(self.gen_frames * self.args.gradperstep)))
                for learner in self.portfolio
            ]

            # Start threads
            for thread in threads:
                thread.start()

            #Join threads
            for thread in threads:
                thread.join()
            self.gen_frames = 0

        ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############
        if not ISOLATE_PG:
            all_fitness = []
            all_net_ids = []
            all_eplens = []
            # Polls the result pipes in a loop until the collected fraction
            # of the population reaches asynch_frac
            while True:
                for i in range(self.args.pop_size):
                    if self.evo_result_pipes[i][1].poll():
                        entry = self.evo_result_pipes[i][1].recv()
                        all_fitness.append(entry[1])
                        all_net_ids.append(entry[0])
                        all_eplens.append(entry[2])
                        self.gen_frames += entry[2]
                        self.total_frames += entry[2]
                        self.evo_flag[i] = True

                # Soft-join: stop once enough of the population has reported
                if len(all_fitness
                       ) / self.args.pop_size >= self.args.asynch_frac:
                    break

        ########## HARD -JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############
        for i in range(self.args.rollout_size):
            # blocking recv: every learner rollout must finish
            entry = self.result_pipes[i][1].recv()
            learner_id = entry[0]
            fitness = entry[1]
            num_frames = entry[2]
            self.portfolio[learner_id].update_stats(fitness, num_frames)

            self.gen_frames += num_frames
            self.total_frames += num_frames
            if fitness > self.best_score:
                self.best_score = fitness

            self.roll_flag[i] = True

        #Refresh buffer (housekeeping tasks - pruning to keep under capacity)
        self.replay_buffer.referesh()
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        if not ISOLATE_PG:
            best_fitness = max(all_fitness)
            champ_index = all_net_ids[all_fitness.index(best_fitness)]
            utils.hard_update(self.test_bucket[0], self.pop[champ_index])
            if best_fitness > self.best_score:
                self.best_score = best_fitness
                utils.hard_update(self.best_policy, self.pop[champ_index])
                if SAVE:
                    torch.save(
                        self.pop[champ_index].state_dict(),
                        self.args.aux_folder + ENV_NAME + '_best' + SAVETAG)
                    print("Best policy saved with score",
                          '%.2f' % best_fitness)

        else:  #Run PG in isolation
            utils.hard_update(self.test_bucket[0], self.rollout_bucket[0])

        ###### TEST SCORE ######
        if self.test_flag:
            self.test_flag = False
            test_scores = []
            for pipe in self.test_result_pipes:  #Collect all results
                entry = pipe[1].recv()
                test_scores.append(entry[1])
            test_scores = np.array(test_scores)
            test_mean = np.mean(test_scores)
            test_std = (np.std(test_scores))

            # Update score to trackers
            frame_tracker.update([test_mean], self.total_frames)
        else:
            test_mean, test_std = None, None

        #NeuroEvolution's probabilistic selection and recombination step
        if not ISOLATE_PG:
            # every 5th generation the rollout bucket is also passed to epoch()
            if gen % 5 == 0:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, self.rollout_bucket)
            else:
                self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids,
                                   all_fitness, [])

        #META LEARNING - RESET ALLOCATION USING UCB
        # (gen % 1 is 0 for every integer gen, so this runs each generation)
        if gen % 1 == 0:
            self.allocation = ucb(len(self.allocation), self.portfolio,
                                  self.args.ucb_coefficient)

        #Metrics
        if not ISOLATE_PG:
            # recomputed here, after evolver.epoch() has seen all_fitness
            champ_len = all_eplens[all_fitness.index(max(all_fitness))]
            champ_wwid = int(self.pop[champ_index].wwid.item())
            max_fit = max(all_fitness)
        else:
            # fitness / num_frames are from the last learner rollout joined above
            champ_len = num_frames
            champ_wwid = int(self.rollout_bucket[0].wwid.item())
            all_fitness = [fitness]
            max_fit = fitness
            all_eplens = [num_frames]

        return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid
# Example #3
# 0
class ERL_Agent:
    """Main ERL class containing all methods for CERL

        Parameters:
        args (object): Parameter class with all the parameters

    """
    def __init__(self, args):
        """Build the evolver, population, replay buffer and rollout workers.

            Parameters:
                args (object): Parameter container. This constructor reads
                    ``pop_size``, ``model_save`` and ``data_folder`` from it
                    and hands the whole object to ``SSNE`` and ``Actor``.

            Side effects:
                Starts one ``rollout_worker`` process per population member.
        """
        self.args = args
        self.evolver = SSNE(self.args)

        #MP TOOLS
        self.manager = Manager()

        #Init population
        # Items read back from a manager list are copies, so flipping a
        # module to eval mode AFTER it was appended only changes the copy and
        # is lost. Each actor is therefore moved to the CPU and switched to
        # eval mode BEFORE it is stored in the shared list.
        self.pop = self.manager.list()
        for _ in range(args.pop_size):
            actor = Actor(args)
            actor = actor.cpu()
            actor.eval()
            self.pop.append(actor)
        self.best_policy = Actor(args)

        if SEED_POP:
            self.load_seed(args.model_save, self.pop)

        #Init BUFFER
        self.replay_buffer = Buffer(100000, self.args.data_folder)

        #MP TOOLS (experience list shared with the workers + per-worker pipes)
        self.exp_list = self.manager.list()
        self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)]
        self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)]

        # Each worker receives end [1] of its task pipe and of its result
        # pipe; this object keeps end [0] of both (used in train()). The
        # remaining positional arguments are forwarded unchanged and their
        # meaning is defined by rollout_worker.
        self.evo_workers = [
            Process(target=rollout_worker,
                    args=(i, self.evo_task_pipes[i][1],
                          self.evo_result_pipes[i][1], None, self.exp_list,
                          self.pop, DIFFICULTY, USE_RS, True,
                          USE_SYNTHETIC_TARGET, XBIAS, ZBIAS, PHASE_LEN, None,
                          EP_LEN, JGS)) for i in range(args.pop_size)
        ]

        for worker in self.evo_workers:
            worker.start()

        #Trackers
        self.buffer_added = 0  # number of transitions pushed via add_experience()
        self.best_score = 0.0  # best fitness seen so far by train()
        self.frames_seen = 0.0  # running sum of the episode lengths reported by workers
        self.best_shaped_score = None  # sized lazily by train() on its first USE_RS pass
        # eval_flag[i] is True when slot i should be sent a rollout request on
        # the next call to train().
        self.eval_flag = [True] * args.pop_size

    def load_seed(self, dir, pop):
        """Read models from drive and sync them into the population

            Files in ``dir`` are visited in sorted name order and the i-th
            file is loaded into ``pop[i]``. Loading is best-effort: a file
            that cannot be loaded (including any file beyond the size of
            ``pop``) is reported on stdout and skipped.

            Parameters:
                  dir (str): Folder location to pull models from (with or
                      without a trailing path separator)
                  pop (shared_list): population of models

            Returns:
                None


        """
        # os.listdir() returns entries in arbitrary order; sorting makes the
        # file -> population slot assignment reproducible.
        list_files = sorted(os.listdir(dir))
        print(list_files)
        for i, model in enumerate(list_files):
            # os.path.join is correct whether or not dir ends with a separator.
            model_path = os.path.join(dir, model)
            try:
                member = pop[i]
                # NOTE(security): torch.load unpickles the file; only point
                # this at model folders you trust.
                member.load_state_dict(torch.load(model_path))
                member.eval()
            except Exception as err:
                # Deliberately best-effort, but no longer swallows
                # KeyboardInterrupt / SystemExit like a bare ``except:`` did.
                print(model, 'Failed to load', repr(err))

    def add_experience(self, state, action, next_state, reward, done_probs,
                       done):
        """Process and send experiences to be added to the buffer

              Parameters:
                  state (ndarray): Current State
                  next_state (ndarray): Next State
                  action (ndarray): Action
                  reward (ndarray): Reward
                  done_dist (ndarray): Temporal distance to done (#action steps after which the skselton fell over)
                  done (ndarray): Done

              Returns:
                  None
          """

        self.buffer_added += 1
        self.replay_buffer.push(state, next_state, action, reward, done_probs,
                                done)
        if self.buffer_added % 100000 == 0: self.replay_buffer.save()

    def train(self, gen):
        """Run one generation: rollouts, champion bookkeeping/saving and neuroevolution

            Parameters:
                gen (int): Current epoch of training

            Returns:
                tuple: five items, in this order --
                    best fitness collected this generation,
                    episode length reported with that best fitness,
                    list of all collected fitnesses,
                    list of all collected episode lengths,
                    the collected shaped fitnesses (an ndarray when USE_RS is
                    set, otherwise the plain list).
        """

        ################ ROLLOUTS ##############
        #Start Evo rollouts
        # Loop over indices only: self.pop is a manager list, so enumerating
        # it would fetch a copy of every network just to discard it.
        for net_id in range(len(self.pop)):
            if self.eval_flag[net_id]:
                self.evo_task_pipes[net_id][0].send(True)
                self.eval_flag[net_id] = False

        ########## SOFT -JOIN ROLLOUTS ############
        all_fitness = []
        all_net_ids = []
        all_eplens = []
        all_shaped_fitness = []
        # NOTE: the result pipes are polled in a tight loop until enough
        # results have arrived.
        while True:
            for i in range(self.args.pop_size):
                if self.evo_result_pipes[i][0].poll():
                    # entry = (net id, fitness, episode length, shaped fitness)
                    entry = self.evo_result_pipes[i][0].recv()
                    all_fitness.append(entry[1])
                    all_net_ids.append(entry[0])
                    all_eplens.append(entry[2])
                    self.frames_seen += entry[2]
                    all_shaped_fitness.append(entry[3])
                    self.eval_flag[i] = True  # slot is free for a new rollout

            # Soft-join: continue once the configured fraction
            # (args.asynch_frac) of the population has reported back
            if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac:
                break

        # Add ALL EXPERIENCE COLLECTED TO MEMORY
        for _ in range(len(self.exp_list)):
            exp = self.exp_list.pop()
            self.add_experience(exp[0], exp[1], exp[2], exp[3], exp[4], exp[5])
        ######################### END OF PARALLEL ROLLOUTS ################

        ############ PROCESS MAX FITNESS #############
        # all_fitness is only read between here and the evolver call, so the
        # generation's best score is computed once and reused.
        max_fit = max(all_fitness)
        champ_index = all_net_ids[all_fitness.index(max_fit)]
        if max_fit > self.best_score:
            self.best_score = max_fit
            utils.hard_update(self.best_policy, self.pop[champ_index])
            if SAVE:
                torch.save(self.pop[champ_index].state_dict(),
                           self.args.model_save + 'erl_best' + SAVE_TAG)
                print("Best policy saved with score", '%.2f' % max_fit)

        #Save champion periodically
        if gen % 5 == 0 and max_fit > (self.best_score - 100) and SAVE:
            torch.save(self.pop[champ_index].state_dict(),
                       self.args.model_save + 'champ' + SAVE_TAG)
            torch.save(self.pop[champ_index].state_dict(),
                       self.args.rl_models + 'champ' + SAVE_TAG)
            print("Champ saved with score ", '%.2f' % max_fit)

        if gen % 20 == 0 and SAVE:
            best_lineage = max(self.evolver.lineage)
            torch.save(
                self.pop[self.evolver.lineage.index(
                    best_lineage)].state_dict(),
                self.args.model_save + 'eugenic_champ' + SAVE_TAG)
            print("Eugenic Champ saved with score ", '%.2f' % best_lineage)

        if USE_RS:
            all_shaped_fitness = np.array(all_shaped_fitness)
            if self.best_shaped_score is None:
                #First time run (set the best shaped score size to track a variable # of shaped fitnesses)
                self.best_shaped_score = [0.0] * all_shaped_fitness.shape[1]

            # Best value per shaped metric (one column per metric)
            max_shaped_fit = [max(a) for a in all_shaped_fitness.transpose()]

            for metric_id, metric_best in enumerate(max_shaped_fit):
                if metric_best > self.best_shaped_score[metric_id]:
                    self.best_shaped_score[metric_id] = metric_best
                    # Row of the result that achieved this metric's best value
                    best_row = np.argmax(all_shaped_fitness[:, metric_id])
                    shaped_champ_ind = all_net_ids[best_row]
                    if SAVE:
                        torch.save(
                            self.pop[shaped_champ_ind].state_dict(),
                            self.args.model_save + 'shaped_erl_best' +
                            str(metric_id) + SAVE_TAG)
                        print(
                            "Best Shaped ERL policy saved with true score",
                            '%.2f' % all_fitness[best_row],
                            'and shaped score of ',
                            '%.2f' % metric_best,
                            'for metric id', str(metric_id))

        #NeuroEvolution's probabilistic selection and recombination step
        self.evolver.epoch(self.pop, all_net_ids, all_fitness,
                           all_shaped_fitness)

        # Synch RL Agent to NE periodically
        if gen % 5 == 0:
            self.evolver.sync_rl(self.args.rl_models, self.pop)

        # Recomputed here on purpose: the evolver was handed all_fitness, so
        # the value cached before that call is not reused.
        final_max = max(all_fitness)
        return (final_max, all_eplens[all_fitness.index(final_max)],
                all_fitness, all_eplens, all_shaped_fitness)