Example #1
    def renew_learner(self):  # create a new learning agent with randomized initial parameters
        self.learner = Learner(-1,
                               self.algo,
                               self.state_dim,
                               self.action_dim,
                               actor_lr=5e-5,
                               critic_lr=1e-3,
                               gamma=0.99,
                               tau=5e-3,
                               init_w=True,
                               **self.td3args)
        self.actual_red_actor = self.learner.algo.actor
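A minimal usage sketch for context: `Evaluator` (shown in full in Example #5) owns this method, and `cerl_agent` and `blue_trainer` below are hypothetical placeholders for objects supplied by the surrounding CERL-style codebase. After `renew_learner()` swaps in a fresh `Learner`, the shared rollout actor still holds the old weights and must be re-synced.

# hypothetical driver; `cerl_agent` and `blue_trainer` are placeholders
evaluator = Evaluator(cerl_agent, num_workers=4, trainers=[blue_trainer, None])
evaluator.renew_learner()  # fresh Learner with randomized initial weights
utils.hard_update(evaluator.actual_red_worker,  # push the new parameters to the
                  evaluator.actual_red_actor)   # shared rollout actor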
Example #2
def train(flags):  # pylint: disable=too-many-branches, too-many-statements
    ray.init()

    if flags.xpid is None:
        flags.xpid = "torchbeast-%s" % time.strftime("%Y%m%d-%H%M%S")

    flags.replay_batch_size = int(flags.batch_size * flags.replay_ratio)

    stat_keys = [
        "total_loss",
        "mean_episode_return",
        "pg_loss",
        "baseline_loss",
        "entropy_loss",
    ]
    logger = logging.getLogger("logfile")

    flags.device = None
    if not flags.disable_cuda and torch.cuda.is_available():
        logger.error("Using CUDA.")
        flags.device = torch.device("cuda")
    else:
        logger.error("Not using CUDA.")
        flags.device = torch.device("cpu")

    env = create_env(flags)

    actors = []
    for i in range(flags.num_actors):
        actors.append(Actor.remote(
            flags,
            i,
        ))

    learner = Learner.remote(flags, actors, env.observation_space.shape[0],
                             env.action_space.n, stat_keys)
    learner_handle = learner.train.remote()

    ray.wait([learner_handle])
    ray.wait([actors[0].print_timings.remote()])
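A hedged sketch of how `train(flags)` might be driven. It assumes `flags` is an `argparse.Namespace` carrying at least the attributes read above (`xpid`, `batch_size`, `replay_ratio`, `disable_cuda`, `num_actors`); the defaults below are illustrative, and the real project defines many more flags.

import argparse

parser = argparse.ArgumentParser(description="driver sketch for train(flags)")
parser.add_argument("--xpid", default=None, help="experiment id")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--replay_ratio", type=float, default=0.5)
parser.add_argument("--disable_cuda", action="store_true")
parser.add_argument("--num_actors", type=int, default=4)

if __name__ == "__main__":
    flags = parser.parse_args()
    train(flags)  # launches the Ray learner and actor workers defined above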
Example #3
    def __init__(self):
        """Parameter class stores all parameters for policy gradient

		Parameters:
			None

		Returns:
			None
		"""

        self.seed = SEED
        self.asynch_frac = 1.0  # Asynchrony of neuroevolution
        self.algo = ALGO
        self.drqn = DRQN
        self.isolate_pg = ISOLATE_PG

        self.render = RENDER
        self.batch_size = BATCHSIZE  # Batch size
        self.noise_std = 0.1  # Gaussian noise exploration std
        self.ucb_coefficient = 0.25  # Exploration coefficient in UCB (previously 0.9)
        self.gradperstep = GRADPERSTEP
        self.buffer_gpu = BUFFER_GPU
        self.rollout_size = ROLLOUT_SIZE  # Size of learner rollouts

        # NeuroEvolution stuff
        self.pop_size = POP_SIZE
        self.elite_fraction = 0.2
        self.crossover_prob = 0.15
        self.mutation_prob = 0.90

        # ----- unused -----
        self.extinction_prob = 0.005  # Probability of an extinction event
        # Probability of extinction for each genome, given an extinction event
        self.extinction_magnituide = 0.5
        self.weight_magnitude_limit = 10000000
        self.mut_distribution = 1  # 1-Gaussian, 2-Laplace, 3-Uniform

        # Environment-specific setup
        if ALGO == 'dis':
            # make_self_play_env does not need real trainers here; we only want blue_agent_trainer
            dummy_env, blue_agent_trainer = make_self_play_env(
                trainers=[[], []], blue_use_drqn=DRQN)
            # blue_agent_trainer actually contains two trainers
            self.blue_trainer = blue_agent_trainer[0]
            self.blue_trainer.share_memory()
            self.action_dim = dummy_env.action_dim
            self.state_dim = dummy_env.state_dim
            self.action_low = 0
            self.action_high = 1
        elif ALGO == 'TD3_tennis':
            no_graphics = not RENDER
            dummy_env, self.action_dim, self.state_dim = make_tennis_env.TennisEnvFactory(
                seed=SEED, no_graphics=no_graphics, pid=-1).getEnv()
            self.action_low = -1.0
            self.action_high = +1.0  # according to the Unity documentation
            td3args = {
                'policy_noise': 0.2,
                'policy_noise_clip': 0.5,
                'policy_ups_freq': 2,
                'action_low': self.action_low,
                'action_high': self.action_high,
                'cerl_args': self
            }
            self.blue_trainer = Learner(-1,
                                        'TD3',
                                        self.state_dim,
                                        self.action_dim,
                                        actor_lr=5e-5,
                                        critic_lr=1e-3,
                                        gamma=0.99,
                                        tau=5e-3,
                                        init_w=True,
                                        **td3args)
            self.blue_trainer.share_memory()
        else:
            dummy_env = gym.make(ENV_NAME)
            self.state_dim = dummy_env.observation_space.shape[0]
            self.action_dim = dummy_env.action_space.shape[0]
            self.action_low = float(dummy_env.action_space.low[0])
            self.action_high = float(dummy_env.action_space.high[0])
        # Save results
        self.savefolder = 'Results/'
        if not os.path.exists('Results/'):
            os.makedirs('Results/')
        if not os.path.exists('pytorch_models/'):
            os.makedirs('pytorch_models/')
        self.aux_folder = self.savefolder + 'Auxiliary/'
        if not os.path.exists(self.aux_folder):
            os.makedirs(self.aux_folder)
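A short usage sketch, with the caveat that the enclosing class name (`Parameters` here) and the module-level constants it reads (SEED, ALGO, BATCHSIZE, ...) are assumptions based on the docstring and snippet above.

# hypothetical usage; `Parameters` is the assumed name of the class whose __init__ is shown above
args = Parameters()
print(args.state_dim, args.action_dim)    # environment dimensions resolved in __init__
print(args.action_low, args.action_high)  # action bounds later passed to TD3 learners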
Example #4
def initialize_portfolio(portfolio, args, genealogy, portfolio_id):
    """Portfolio of learners

        Parameters:
            portfolio (list): Incoming list of learners
            args (object): Parameter class
            genealogy (object): Genealogy tracker used to issue new learner ids
            portfolio_id (int): Id of the preset portfolio to construct

        Returns:
            portfolio (list): Portfolio of learners
    """

    if portfolio_id == 10:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.9,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

        # Learner 3
        wwid = genealogy.new_id('learner_3')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.99,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

        # Learner 4
        wwid = genealogy.new_id('learner_4')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.997,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

        # Learner 4 (second entry; note it reuses the 'learner_4' id, with gamma=0.9995)
        wwid = genealogy.new_id('learner_4')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.9995,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    if portfolio_id == 11:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.9,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    if portfolio_id == 12:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.99,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    if portfolio_id == 13:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.997,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    if portfolio_id == 14:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.9995,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    # Motivating example
    if portfolio_id == 100:

        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.0,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

        # Learner 2
        wwid = genealogy.new_id('learner_2')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=1.0,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    if portfolio_id == 101:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 3
        wwid = genealogy.new_id('learner_3')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=0.0,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    if portfolio_id == 102:
        td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': args.action_low,
            'action_high': args.action_high
        }

        # Learner 1
        wwid = genealogy.new_id('learner_1')
        portfolio.append(
            Learner(wwid,
                    'TD3',
                    args.state_dim,
                    args.action_dim,
                    actor_lr=1e-3,
                    critic_lr=1e-3,
                    gamma=1.0,
                    tau=5e-3,
                    init_w=True,
                    **td3args))

    return portfolio
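A hedged usage sketch: the function mutates and returns the incoming list, so it is typically called with an empty list. `Genealogy` is assumed to be the tracker behind `genealogy.new_id(...)`, and `args` a parameter object exposing `state_dim`, `action_dim`, `action_low`, and `action_high` (as in Example #3).

genealogy = Genealogy()  # assumed genealogy tracker from this codebase
portfolio = initialize_portfolio([], args, genealogy, portfolio_id=10)
print(len(portfolio))    # portfolio 10 appends four TD3 learners with gammas 0.9, 0.99, 0.997, 0.9995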
Example #5
class Evaluator(object):
    def __init__(self, CERL_agent, num_workers, trainers, pomdp_adv=False):
        # trainers: the first entry is the blue agent, the second is the red model
        self.num_workers = num_workers
        self.trainers = trainers
        self.pomdp_adv = pomdp_adv
        self.args = CERL_agent.args
        self.drqn = CERL_agent.args.drqn  # whether the blue agent uses DRQN
        if self.pomdp_adv:
            self.trainers = [trainers[0], None]  # make sure the red model is never used
        self.buffer_gpu = CERL_agent.args.buffer_gpu
        self.batch_size = CERL_agent.args.batch_size
        self.algo = CERL_agent.args.algo
        self.state_dim = CERL_agent.args.state_dim
        self.action_dim = CERL_agent.args.action_dim
        self.buffer = Buffer(BUFFER_SIZE,
                             self.buffer_gpu)  #initialize own replay buffer
        self.data_bucket = self.buffer.tuples
        self.evo_task_pipes = [Pipe() for _ in range(self.num_workers)]
        self.evo_result_pipes = [Pipe() for _ in range(self.num_workers)]
        self.actual_red_worker = Actor(
            CERL_agent.args.state_dim, CERL_agent.args.action_dim, -1,
            'dis')  # this model is shared across the workers
        self.actual_red_worker.share_memory()
        self.td3args = {
            'policy_noise': 0.2,
            'policy_noise_clip': 0.5,
            'policy_ups_freq': 2,
            'action_low': CERL_agent.args.action_low,
            'action_high': CERL_agent.args.action_high,
            'cerl_args': self.args
        }
        self.renew_learner()  # currently we do not create a new learner for each iteration
        self.rollout_bucket = [
            self.actual_red_worker for i in range(num_workers)
        ]
        self.workers = [
            Process(target=rollout_worker,
                    args=(id, 3, self.evo_task_pipes[id][1],
                          self.evo_result_pipes[id][0], False,
                          self.data_bucket, self.rollout_bucket, 'dummy_name',
                          None, 'dis', self.trainers, False, self.pomdp_adv))
            for id in range(num_workers)
        ]

        for worker in self.workers:
            worker.start()
        self.evo_flag = [True for _ in range(self.num_workers)]

    #def initialize(self, actor_in):  #use the given actor parameter to initialize the red actor
    #    utils.hard_update(self.actual_red_actor, actor_in)

    def renew_learner(self):  # create a new learning agent with randomized initial parameters
        self.learner = Learner(-1,
                               self.algo,
                               self.state_dim,
                               self.action_dim,
                               actor_lr=5e-5,
                               critic_lr=1e-3,
                               gamma=0.99,
                               tau=5e-3,
                               init_w=True,
                               **self.td3args)
        self.actual_red_actor = self.learner.algo.actor

    def collect_trajectory(self):
        utils.hard_update(self.actual_red_worker,
                          self.actual_red_actor)  # first sync the actor

        # launch rollout workers
        for id, actor in enumerate(self.rollout_bucket):
            if self.evo_flag[id]:
                self.evo_task_pipes[id][0].send(
                    (id, 0))  # the second element of the sent tuple is a dummy value
                self.evo_flag[id] = False

        # wait for the rollouts to complete and record fitness
        all_fitness = []
        for i in range(self.num_workers):
            entry = self.evo_result_pipes[i][1].recv()
            all_fitness.append(entry[1])
            self.evo_flag[i] = True

        self.buffer.referesh()  # update the replay buffer ('referesh' is the Buffer method's actual name)

        return all_fitness

    def train_red(self, training_iterations):
        # alternate between collect_trajectory and parameter updates
        while len(self.buffer) < self.batch_size * 10:  # burn-in period
            self.collect_trajectory()

        for i in range(training_iterations):
            self.collect_trajectory()
            self.buffer.tensorify()  # Tensorify the buffer for fast sampling
            self.learner.update_parameters(self.buffer, self.buffer_gpu,
                                           self.batch_size, 2)  #2 update steps

    def evaluate(self):
        # evaluate the quality of the blue agent's policy by training a red agent against it;
        # after evaluation, erase the replay buffer and renew the learner
        self.train_red(TRAIN_ITERATION)
        self.clear_buffer()
        #self.renew_learner()
        return self.evaluate_fixed_agents(
            self.trainers[0], self.trainers[1],
            [self.actual_red_actor])  # returns averaged evaluation metrics and belief/true-type traces

    def evaluate_fixed_agents(self, blue_dqn, red_model, red_actor_list, num_iterations=25):
        # evaluate the performance of the given agents, using random neutral and red agents
        if self.algo == 'dis':  # make an env with the blue and red policy agents inside
            dis_env = make_self_play_env(
                seed=0,
                return_policy_agent=False,
                trainers=[blue_dqn, red_model]
            )[0]  # if trainers is not None, the first is the shared DQN agent, the second the best red policy
            env = EnvironmentWrapper(
                '', self.algo, dis_env,
                0)  # the "0" is the index of the blue agent being trained
        elif self.algo == 'TD3_tennis':
            tennis_env = make_tennis_env.TennisEnvFactory(
                seed=np.random.choice(np.array(range(len(self.pop)))),
                no_graphics=True,
                pid=-1).getEnv()[0]
            env = EnvironmentWrapper('Tennis', self.algo, tennis_env, 0)
        else:
            raise Exception("evaluate_fixed_agents only supports the 'dis' and 'TD3_tennis' environments")
        average_reward = 0
        eps = 0
        average_red_reward = 0
        red_count = 0
        average_actual_blue_reward = 0
        blue_count = 0
        belief_and_true_type_list = []
        assert red_actor_list, "make sure to input a non-empty list of possible red actors"
        for it in range(num_iterations):
            belief_and_true_type = []
            if not self.pomdp_adv:  # if pomdp_adv, make sure that TD3_actor is never used
                red_actor = random.choice(red_actor_list)
                env.set_TD3_actor(red_actor)
            fitness = 0.0  # here fitness is simply the accumulated reward
            state = env.reset()
            belief_and_true_type.append(env.belief_and_true_type())
            env.randomize_neu_adv()

            if self.pomdp_adv:
                env.try_set_pomdp_adv()  # set the opponent to the POMDP adversary if the opponent is an adversary; else do nothing

            render_flag = (np.random.random() < 0.05)
            while True:  # loop until done
                action = blue_dqn.act(state, eps=eps)
                next_state, reward, done, info = env.step(
                    copy.deepcopy(action), use_actual_reward=self.drqn)
                belief_and_true_type.append(env.belief_and_true_type())
                if render_flag and self.args.render:
                    env.render()

                state = next_state
                fitness += reward

                if done:
                    # cache the reward getters (assumed side-effect-free) so each is called once
                    red_reward = env.get_red_reward()
                    blue_reward = env.get_blue_actual_reward()
                    average_red_reward += red_reward if red_reward is not None else 0
                    average_actual_blue_reward += blue_reward if blue_reward is not None else 0
                    red_count += 1 if red_reward is not None else 0
                    blue_count += 1 if blue_reward is not None else 0
                    if render_flag:
                        env.env.close()
                    break
            belief_and_true_type_list.append(belief_and_true_type)
            average_reward += fitness
        average_reward /= num_iterations
        if red_count != 0:
            average_red_reward /= red_count
        if blue_count != 0:
            average_actual_blue_reward /= blue_count
        return average_reward, average_red_reward, average_actual_blue_reward, belief_and_true_type_list

    def clear_buffer(self):
        self.buffer.clear_buffer_data()  #reinitialize replay buffer

    def kill_processes(self):
        for id, actor in enumerate(self.rollout_bucket):
            self.evo_task_pipes[id][0].send(
                ('TERMINATE', 0))  #second argument in send is dummy

    def __del__(self):
        self.kill_processes()
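A hedged end-to-end sketch of the `Evaluator` lifecycle; `cerl_agent`, `blue_trainer`, and `red_model` are hypothetical placeholders for objects provided by the surrounding codebase. Construction spawns the rollout worker processes, `evaluate()` trains a red learner against the blue policy and returns the averaged rewards plus the belief/true-type traces, and `kill_processes()` shuts the workers down.

# hypothetical driver code
evaluator = Evaluator(cerl_agent, num_workers=4,
                      trainers=[blue_trainer, red_model], pomdp_adv=False)
avg_reward, avg_red_reward, avg_blue_reward, beliefs = evaluator.evaluate()
print(avg_reward, avg_red_reward, avg_blue_reward)
evaluator.kill_processes()  # ask each rollout worker to terminate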