Code Example #1
File: covar_gmm.py Project: flowersteam/TeachMyAgent
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 absolute_lp=False,
                 fit_rate=250,
                 potential_ks=np.arange(2, 11, 1),
                 random_task_ratio=0.2,
                 nb_bootstrap=None,
                 initial_dist=None):
        '''
            Covar - Gaussian Mixture Model.
            Implementation of IGMM (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3893575/) + minor improvements.

            Args:
                absolute_lp: Original version does not use Absolute LP, only LP.
                fit_rate: Number of episodes between two fits of the GMM
                potential_ks: Range of number of Gaussians to try when fitting the GMM
                random_task_ratio: Ratio of randomly sampled tasks vs. tasks sampled using the GMM
                nb_bootstrap: Number of bootstrapping episodes, must be >= fit_rate
                initial_dist: Initial Gaussian distribution. If None, bootstrap with random tasks
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        # Range of number of Gaussians to try when fitting the GMM
        self.potential_ks = potential_ks
        # Ratio of randomly sampled tasks vs. tasks sampled using the GMM
        self.random_task_ratio = random_task_ratio
        self.random_task_generator = Box(self.mins,
                                         self.maxs,
                                         dtype=np.float32)
        self.random_task_generator.seed(self.seed)

        # Number of episodes between two fits of the GMM
        self.fit_rate = fit_rate
        self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate  # Number of bootstrapping episodes, must be >= fit_rate
        self.initial_dist = initial_dist  # Initial Gaussian distribution. If None, bootstrap with random tasks

        # Original version does not use Absolute LP, only LP.
        self.absolute_lp = absolute_lp

        self.tasks = []
        self.tasks_times_rewards = []
        self.all_times = np.arange(0, 1, 1 / self.fit_rate)
        self.gmm = None

        # boring book-keeping
        self.bk = {
            'weights': [],
            'covariances': [],
            'means': [],
            'tasks_lps': [],
            'episodes': [],
            'tasks_origin': []
        }
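
To make the learning-progress side of these hyper-parameters concrete, here is a minimal sketch (not repository code) assuming, as in IGMM, that each stored sample concatenates the task, its time stamp and its reward: the signed learning progress of each Gaussian can then be read from the time/reward entry of its covariance matrix, and `absolute_lp=True` simply takes its absolute value.

    import numpy as np
    from sklearn.mixture import GaussianMixture

    rng = np.random.default_rng(0)
    fit_rate = 250
    tasks = rng.uniform(size=(fit_rate, 2))             # toy 2-D tasks
    times = np.arange(0, 1, 1 / fit_rate)               # as in self.all_times
    rewards = times + 0.1 * rng.normal(size=fit_rate)   # reward improving over time
    samples = np.column_stack([tasks, times, rewards])  # assumed layout: [task..., time, reward]

    gmm = GaussianMixture(n_components=3, random_state=0).fit(samples)
    time_idx, reward_idx = -2, -1
    lps = [cov[time_idx, reward_idx] for cov in gmm.covariances_]  # signed LP per Gaussian
    abs_lps = np.abs(lps)                                          # absolute_lp variant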
Code Example #2
File: goal_gan.py Project: meln1k/TeachMyAgent
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, state_noise_level, success_distance_threshold,
                 update_size,  n_rollouts=2, goid_lb=0.25, goid_ub=0.75, p_old=0.2, use_pretrained_samples=False,
                 initial_dist=None):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

        np.random.seed(self.seed) # To seed the GAN (sufficient ?)
        tf.set_random_seed(
            seed
        )

        tf_config = tf.ConfigProto(
            allow_soft_placement=True,
            inter_op_parallelism_threads=1,
            intra_op_parallelism_threads=1)
        # Prevent tensorflow from taking all the gpu memory
        tf_config.gpu_options.allow_growth = True
        self.tf_session = tf.Session(config=tf_config)
        self.gan = StateGAN(
            state_size=len(mins),
            evaluater_size=1,
            state_range=0.5 * (self.maxs - self.mins) + 1e-6, # avoid normalization issues for dimensions where min==max
            state_center=mins + 0.5 * (self.maxs - self.mins),
            state_noise_level=(state_noise_level * (self.maxs - self.mins))[None, :],
            generator_layers=[256, 256],
            discriminator_layers=[128, 128],
            noise_size=self.mins.shape[0],
            tf_session=self.tf_session,
            configs={"supress_all_logging": True}
        )
        self.tf_session.run(tf.initialize_local_variables())
        self.replay_noise = state_noise_level * (self.maxs - self.mins)
        self.success_buffer = StateCollection(1, success_distance_threshold * np.linalg.norm(self.maxs - self.mins))

        self.update_size = update_size
        self.contexts = []
        self.labels = []

        self.p_old = p_old
        self.n_rollouts = n_rollouts
        self.goid_lb = goid_lb
        self.goid_ub = goid_ub

        self.pending_contexts = {}
        self.context_queue = Queue()
        self.episode_counter = 0

        if use_pretrained_samples:
            print("Pretraining GAN...")
            initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True)
            pretrain_samples = self.random_state.multivariate_normal(initial_mean, initial_variance, size=1000)
            pretrain_samples = np.clip(pretrain_samples, mins, maxs, dtype=np.float32)
            self.gan.pretrain(pretrain_samples)

        self.bk = {'dis_log_loss': [],
                   'gen_log_loss': [],
                   'episodes': []}
Code Example #3
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, step_size, max_reward_thr, min_reward_thr,
                 initial_dist=None, boundary_sampling_p=0.5, queue_len=10, scale_reward=False):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
        self.nb_dims = len(self.mins)

        # Boundary sampling probability p_r
        self.bound_sampling_p = boundary_sampling_p

        # ADR step size
        self.step_size = step_size

        # Max reward threshold, sampling distribution inflates if mean reward above this
        self.max_reward_threshold = max_reward_thr
        if scale_reward:
            self.max_reward_threshold = np.interp(self.max_reward_threshold,
                                                  (self.env_reward_lb, self.env_reward_ub),
                                                  (0, 1))

        # Min reward threshold, sampling distribution deflates if mean reward below this
        self.min_reward_threshold = min_reward_thr
        if scale_reward:
            self.min_reward_threshold = np.interp(self.min_reward_threshold,
                                                  (self.env_reward_lb, self.env_reward_ub),
                                                  (0, 1))

        # max queue length
        self.window_len = queue_len

        # Set initial task space to predefined calibrated task
        initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True)
        # self.cur_mins = []
        # self.cur_maxs = []
        # for i in range(len(mins)): # 10% of each dimension
        #     current_min = initial_mean[i] - 0.05*(maxs[i]-mins[i]) # mean - 5% of the dimension
        #     current_max = initial_mean[i] + 0.05*(maxs[i]-mins[i]) # mean + 5% of the dimension
        #     self.cur_mins.append(max(mins[i], current_min))
        #     self.cur_maxs.append(min(maxs[i], current_max))

        # Single task version (as the original paper)
        self.cur_mins = initial_mean
        self.cur_maxs = initial_mean

        self.cur_mins = np.array(self.cur_mins, dtype=np.float32)  # current min bounds
        self.cur_maxs = np.array(self.cur_maxs, dtype=np.float32)  # current max bounds
        self.task_space = Box(self.cur_mins, self.cur_maxs, dtype=np.float32)
        self.task_space.seed(self.seed)

        # Init queues, one per task space dimension
        self.min_queues = [deque(maxlen=self.window_len) for _ in range(self.nb_dims)]
        self.max_queues = [deque(maxlen=self.window_len) for _ in range(self.nb_dims)]

        # Boring book-keeping
        self.episode_nb = 0
        self.bk = {'task_space': [(self.cur_mins.copy(),self.cur_maxs.copy())],
                   'episodes': []}
Code Example #4
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
        '''
            Random teacher sampling tasks uniformly at random over the task space.
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        self.random_task_generator = Box(np.array(mins),
                                         np.array(maxs),
                                         dtype=np.float32)
        self.random_task_generator.seed(self.seed)
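
A minimal usage sketch of the uniform sampling this teacher relies on (the bounds below are made up for illustration): gym's Box space draws each dimension independently and uniformly within its bounds.

    import numpy as np
    from gym.spaces import Box

    task_space = Box(np.array([-1.0, 0.0]), np.array([1.0, 3.0]), dtype=np.float32)
    task_space.seed(42)
    task = task_space.sample()  # one task sampled uniformly over the box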
Code Example #5
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 update_frequency=100,
                 setter_loss_noise_ub=0.01,
                 setter_hidden_size=128):
        '''
            Setter-Solver (https://arxiv.org/abs/1909.12892).
            Made with the help of https://drive.google.com/drive/folders/1yjhztFeX67tHEImXCiP_UAQfQ-wFvV4Y.

            Args:
                update_frequency: How many episodes between two updates of the neural networks
                setter_loss_noise_ub: Upper bound of the noise added to tasks in the Setter's loss
                setter_hidden_size: Number of neurons in the Setter's layers
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)
        self.nb_dims = len(self.mins)

        tf.set_random_seed(seed)
        tf_config = tf.ConfigProto()
        # Prevent tensorflow from taking all the gpu memory
        tf_config.gpu_options.allow_growth = True
        self.tf_session = tf.Session(config=tf_config)

        self.update_frequency = update_frequency
        self.episode_counter = 0

        self.judge = Judge(hidden_sizes=[64, 64, 64],
                           tf_session=self.tf_session,
                           goal_size=self.nb_dims)
        self.setter = FlatRnvp(latent_size=self.nb_dims,
                               num_blocks=3,
                               num_layers_per_block=3,
                               tf_session=self.tf_session,
                               judge_output_op=self.judge._mlp,
                               hidden_size=setter_hidden_size,
                               final_non_linearity=ClippedSigmoid,
                               loss_noise_ub=setter_loss_noise_ub,
                               random_state=self.random_state)

        self.tf_session.run(tf.global_variables_initializer())

        self.goal_buffer = GoalBuffer()
        self.pending_goals = []
        self.bk = {'judge_loss': [], 'setter_loss': [], 'episodes': []}
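
A rough sketch (an assumption, not repository code) of the update cadence that `update_frequency` controls: goal outcomes are buffered as episodes finish, and the Judge/Setter networks are only trained once enough episodes have accumulated. `train_judge_and_setter` below is a hypothetical stand-in for those gradient steps.

    update_frequency = 100
    episode_counter = 0
    pending_goals = []

    def train_judge_and_setter(batch):
        # hypothetical placeholder for the Judge / Setter training steps
        print(f"updating networks on {len(batch)} (goal, success) pairs")

    def episode_done(goal, success):
        global episode_counter
        pending_goals.append((goal, success))
        episode_counter += 1
        if episode_counter % update_frequency == 0:
            train_judge_and_setter(pending_goals)
            pending_goals.clear()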
Code Example #6
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 absolute_lp=False,
                 fit_rate=250,
                 potential_ks=np.arange(2, 11, 1),
                 random_task_ratio=0.2,
                 nb_bootstrap=None,
                 initial_dist=None):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        # Range of number of Gaussians to try when fitting the GMM
        self.potential_ks = potential_ks
        # Ratio of randomly sampled tasks vs. tasks sampled using the GMM
        self.random_task_ratio = random_task_ratio
        self.random_task_generator = Box(self.mins,
                                         self.maxs,
                                         dtype=np.float32)
        self.random_task_generator.seed(self.seed)

        # Number of episodes between two fits of the GMM
        self.fit_rate = fit_rate
        self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate  # Number of bootstrapping episodes, must be >= fit_rate
        self.initial_dist = initial_dist  # Initial Gaussian distribution. If None, bootstrap with random tasks

        # Original version does not use Absolute LP, only LP.
        self.absolute_lp = absolute_lp

        self.tasks = []
        self.tasks_times_rewards = []
        self.all_times = np.arange(0, 1, 1 / self.fit_rate)
        self.gmm = None

        # boring book-keeping
        self.bk = {
            'weights': [],
            'covariances': [],
            'means': [],
            'tasks_lps': [],
            'episodes': [],
            'tasks_origin': []
        }
Code Example #7
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 update_frequency=100,
                 setter_loss_noise_ub=0.01,
                 setter_hidden_size=128):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)
        self.nb_dims = len(self.mins)

        tf.set_random_seed(seed)
        tf_config = tf.ConfigProto()
        # Prevent tensorflow from taking all the gpu memory
        tf_config.gpu_options.allow_growth = True
        self.tf_session = tf.Session(config=tf_config)

        self.update_frequency = update_frequency
        self.episode_counter = 0

        self.judge = Judge(hidden_sizes=[64, 64, 64],
                           tf_session=self.tf_session,
                           goal_size=self.nb_dims)
        self.setter = FlatRnvp(latent_size=self.nb_dims,
                               num_blocks=3,
                               num_layers_per_block=3,
                               tf_session=self.tf_session,
                               judge_output_op=self.judge._mlp,
                               hidden_size=setter_hidden_size,
                               final_non_linearity=ClippedSigmoid,
                               loss_noise_ub=setter_loss_noise_ub,
                               random_state=self.random_state)

        self.tf_session.run(tf.global_variables_initializer())

        self.goal_buffer = GoalBuffer()
        self.pending_goals = []
        self.bk = {'judge_loss': [], 'setter_loss': [], 'episodes': []}
Code Example #8
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 max_region_size=200,
                 alp_window_size=None,
                 nb_split_attempts=50,
                 sampling_in_leaves_only=False,
                 min_region_size=None,
                 min_dims_range_ratio=1 / 6,
                 discard_ratio=1 / 4):

        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        # Maximal number of (task, reward) pairs a region can hold before splitting
        self.maxlen = max_region_size

        self.alp_window = self.maxlen if alp_window_size is None else alp_window_size

        # Initialize Regions' tree
        self.tree = Tree()
        self.regions_bounds = [Box(self.mins, self.maxs, dtype=np.float32)]
        self.regions_alp = [0.]
        self.tree.create_node('root',
                              'root',
                              data=Region(maxlen=self.maxlen,
                                          r_t_pairs=[
                                              deque(maxlen=self.maxlen + 1),
                                              deque(maxlen=self.maxlen + 1)
                                          ],
                                          bounds=self.regions_bounds[-1],
                                          alp=self.regions_alp[-1]))
        self.nb_dims = len(mins)
        self.nb_split_attempts = nb_split_attempts

        # Whether task sampling uses parent and child regions (False) or only child regions (True)
        self.sampling_in_leaves_only = sampling_in_leaves_only

        # Additional tricks compared to the original RIAC, enforcing splitting rules

        # 1 - Minimum population required for both children when splitting --> set to 1 to cancel
        self.minlen = self.maxlen / 20 if min_region_size is None else min_region_size

        # 2 - minimum children region size (compared to initial range of each dimension)
        # Set min_dims_range_ratio to 1/np.inf to cancel
        self.dims_ranges = self.maxs - self.mins
        self.min_dims_range_ratio = min_dims_range_ratio

        # 3 - If after nb_split_attempts, no split is valid, flush oldest points of parent region
        # If 1- and 2- are canceled, this will be canceled since any split will be valid
        self.discard_ratio = discard_ratio

        # book-keeping
        self.sampled_tasks = []
        self.all_boxes = []
        self.all_alps = []
        self.update_nb = -1
        self.split_iterations = []

        self.hyperparams = locals()
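
The two splitting "tricks" above can be illustrated with a short sketch (not repository code): a candidate split along a random dimension is only accepted if both children keep at least `minlen` points and neither side becomes thinner than the allowed fraction of the dimension's initial range.

    import numpy as np

    def try_split(low, high, points, minlen, min_dims_range_ratio, dims_ranges, rng):
        dim = rng.integers(len(low))             # dimension to cut
        cut = rng.uniform(low[dim], high[dim])   # candidate cut position
        left = points[points[:, dim] <= cut]
        right = points[points[:, dim] > cut]
        min_width = min_dims_range_ratio * dims_ranges[dim]
        wide_enough = (cut - low[dim] >= min_width) and (high[dim] - cut >= min_width)
        if len(left) >= minlen and len(right) >= minlen and wide_enough:
            return dim, cut                      # valid split
        return None                              # retry (up to nb_split_attempts times)

    rng = np.random.default_rng(0)
    pts = rng.uniform(size=(200, 2))
    print(try_split(np.zeros(2), np.ones(2), pts, minlen=10,
                    min_dims_range_ratio=1 / 6, dims_ranges=np.ones(2), rng=rng))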
Code Example #9
File: adr.py Project: flowersteam/TeachMyAgent
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 step_size,
                 max_reward_thr,
                 min_reward_thr,
                 initial_dist=None,
                 boundary_sampling_p=0.5,
                 queue_len=10,
                 scale_reward=False):
        '''
            Automatic Domain Randomization (https://arxiv.org/abs/1910.07113).

            Args:
                step_size: Size of the growth (or decrease) of a bound at each update
                max_reward_thr: Upper reward threshold used to inflate the distribution
                min_reward_thr: Lower reward threshold used to deflate the distribution
                initial_dist: The mean of this initial distribution is used as the initial task used by ADR
                boundary_sampling_p: Probability to sample a dimension at a bound
                queue_len: Size of the queue associated with each bound. Once full, ADR increases or decreases the bound.
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)
        self.nb_dims = len(self.mins)

        # Boundary sampling probability p_r
        self.bound_sampling_p = boundary_sampling_p

        # ADR step size
        self.step_size = step_size

        # Max reward threshold, sampling distribution inflates if mean reward above this
        self.max_reward_threshold = max_reward_thr
        if scale_reward:
            self.max_reward_threshold = np.interp(
                self.max_reward_threshold,
                (self.env_reward_lb, self.env_reward_ub), (0, 1))

        # Min reward threshold, sampling distribution deflates if mean reward below this
        self.min_reward_threshold = min_reward_thr
        if scale_reward:
            self.min_reward_threshold = np.interp(
                self.min_reward_threshold,
                (self.env_reward_lb, self.env_reward_ub), (0, 1))

        # max queue length
        self.window_len = queue_len

        # Set initial task space to predefined calibrated task
        initial_mean, initial_variance = self.get_or_create_dist(initial_dist,
                                                                 mins,
                                                                 maxs,
                                                                 subspace=True)

        # Single task version (as the original paper)
        self.cur_mins = initial_mean
        self.cur_maxs = initial_mean

        self.cur_mins = np.array(self.cur_mins,
                                 dtype=np.float32)  # current min bounds
        self.cur_maxs = np.array(self.cur_maxs,
                                 dtype=np.float32)  # current max bounds
        self.task_space = Box(self.cur_mins, self.cur_maxs, dtype=np.float32)
        self.task_space.seed(self.seed)

        # Init queues, one per task space dimension
        self.min_queues = [
            deque(maxlen=self.window_len) for _ in range(self.nb_dims)
        ]
        self.max_queues = [
            deque(maxlen=self.window_len) for _ in range(self.nb_dims)
        ]

        # Boring book-keeping
        self.episode_nb = 0
        self.bk = {
            'task_space': [(self.cur_mins.copy(), self.cur_maxs.copy())],
            'episodes': []
        }
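
A minimal sketch (not repository code) of the boundary update rule these queues and thresholds implement: each time a task is sampled at a bound, the episode reward is pushed into that bound's queue; once the queue is full, its mean decides whether the bound inflates, deflates, or stays put.

    from collections import deque
    import numpy as np

    step_size, queue_len = 0.1, 10
    max_reward_threshold, min_reward_threshold = 0.8, 0.2
    mins, maxs = np.array([0.0]), np.array([10.0])
    cur_mins, cur_maxs = np.array([5.0]), np.array([5.0])   # single initial task
    max_queues = [deque(maxlen=queue_len) for _ in range(len(mins))]

    def update_upper_bound(dim, episode_reward):
        max_queues[dim].append(episode_reward)
        if len(max_queues[dim]) == queue_len:
            mean_r = np.mean(max_queues[dim])
            if mean_r >= max_reward_threshold:    # student succeeds: inflate
                cur_maxs[dim] = min(maxs[dim], cur_maxs[dim] + step_size)
            elif mean_r <= min_reward_threshold:  # student struggles: deflate
                cur_maxs[dim] = max(cur_mins[dim], cur_maxs[dim] - step_size)
            max_queues[dim].clear()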
Code Example #10
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, update_frequency, update_offset, alpha_function, initial_dist=None,
                 target_dist=None, max_kl=0.1, std_lower_bound=None, kl_threshold=None,  cg_parameters=None,
                 use_avg_performance=False, max_context_buffer_size=1000, reset_contexts=True, discount_factor=0.99):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
        torch.manual_seed(self.seed)

        initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True) # Random subspace of the task space if no initial dist
        target_mean, target_variance = self.get_or_create_dist(target_dist, mins, maxs, subspace=False) # Full task space if no initial dist

        context_bounds = (np.array(mins), np.array(maxs))

        self.update_frequency = update_frequency
        self.update_offset = update_offset
        self.step_counter = 0
        self.discounted_sum_reward = 0
        self.discount_factor = discount_factor
        self.discounted_sum_rewards = []
        self.current_disc = 1
        self.pending_initial_state = None
        self.algorithm_iterations = 0

        # The bounds that we show to the outside are limited to the interval [-1, 1], as this is typically better for
        # neural nets to deal with
        self.context_buffer = Buffer(2, max_context_buffer_size, reset_contexts)
        self.context_dim = target_mean.shape[0]
        self.context_bounds = context_bounds
        self.use_avg_performance = use_avg_performance

        if std_lower_bound is not None and kl_threshold is None:
            raise RuntimeError("Error! Both Lower Bound on standard deviation and kl threshold need to be set")
        else:
            if std_lower_bound is not None:
                if isinstance(std_lower_bound, np.ndarray):
                    if std_lower_bound.shape[0] != self.context_dim:
                        raise RuntimeError("Error! Wrong dimension of the standard deviation lower bound")
                elif std_lower_bound is not None:
                    std_lower_bound = np.ones(self.context_dim) * std_lower_bound
            self.std_lower_bound = std_lower_bound
            self.kl_threshold = kl_threshold

        # Create the initial context distribution
        if isinstance(initial_variance, np.ndarray):
            flat_init_chol = GaussianTorchDistribution.flatten_matrix(initial_variance, tril=False)
        else:
            flat_init_chol = GaussianTorchDistribution.flatten_matrix(initial_variance * np.eye(self.context_dim),
                                                                      tril=False)

        # Create the target distribution
        if isinstance(target_variance, np.ndarray):
            flat_target_chol = GaussianTorchDistribution.flatten_matrix(target_variance, tril=False)
        else:
            flat_target_chol = GaussianTorchDistribution.flatten_matrix(target_variance * np.eye(self.context_dim),
                                                                        tril=False)

        AbstractSelfPacedTeacher.__init__(self, initial_mean, flat_init_chol, target_mean, flat_target_chol,
                                               alpha_function, max_kl, cg_parameters)
        self.bk = {'mean': [],
                   'covariance': [],
                   'steps': [],
                   'algo_iterations': [],
                   'kl': []}
Code Example #11
    def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

        self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
        self.random_task_generator.seed(self.seed)
Code Example #12
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 update_frequency,
                 update_offset,
                 alpha_function,
                 initial_dist=None,
                 target_dist=None,
                 max_kl=0.1,
                 std_lower_bound=None,
                 kl_threshold=None,
                 cg_parameters=None,
                 use_avg_performance=False,
                 max_context_buffer_size=1000,
                 reset_contexts=True,
                 discount_factor=0.99):
        '''
            Self-paced Deep Reinforcement Learning (https://papers.nips.cc/paper/2020/hash/68a9750337a418a86fe06c1991a1d64c-Abstract.html).
            Taken from https://github.com/psclklnk/spdl and wrapped to our architecture.

            Works in a non-episodic setup; updates are thus made in the `step_update` method.

            Args:
                update_frequency: Update frequency of the sampling distribution (in steps)
                update_offset: How many steps must be done before starting to update the distribution
                alpha_function: Function calculating the alpha parameter
                initial_dist: Initial distribution to start from
                target_dist: Target distribution to reach
                max_kl: Maximum KL-divergence authorized between the old and new distributions when updating
                std_lower_bound: Minimum std authorized on the sampling distribution if the KL-divergence between
                                    the latter and the target distribution is greater than `kl_threshold`. Set this to
                                    `None` if no constraint on the std must be applied
                kl_threshold: Threshold enforcing the std constraint
                cg_parameters: Additional parameters for the Conjugate Gradient method
                use_avg_performance: Whether the alpha function must use the averaged performance
                max_context_buffer_size: Maximum size of the buffer storing sampled tasks
                reset_contexts: Whether the buffer should be reset when queried
                discount_factor: Discount factor used in the Universal Value Function
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)
        torch.manual_seed(self.seed)

        initial_mean, initial_variance = self.get_or_create_dist(
            initial_dist, mins, maxs, subspace=True
        )  # Random subspace of the task space if no initial dist
        target_mean, target_variance = self.get_or_create_dist(
            target_dist, mins, maxs,
            subspace=False)  # Full task space if no initial dist

        context_bounds = (np.array(mins), np.array(maxs))

        self.update_frequency = update_frequency
        self.update_offset = update_offset
        self.step_counter = 0
        self.discounted_sum_reward = 0
        self.discount_factor = discount_factor
        self.discounted_sum_rewards = []
        self.current_disc = 1
        self.pending_initial_state = None
        self.algorithm_iterations = 0

        # The bounds that we show to the outside are limited to the interval [-1, 1], as this is typically better for
        # neural nets to deal with
        self.context_buffer = Buffer(2, max_context_buffer_size,
                                     reset_contexts)
        self.context_dim = target_mean.shape[0]
        self.context_bounds = context_bounds
        self.use_avg_performance = use_avg_performance

        if std_lower_bound is not None and kl_threshold is None:
            raise RuntimeError(
                "Error! Both Lower Bound on standard deviation and kl threshold need to be set"
            )
        else:
            if std_lower_bound is not None:
                if isinstance(std_lower_bound, np.ndarray):
                    if std_lower_bound.shape[0] != self.context_dim:
                        raise RuntimeError(
                            "Error! Wrong dimension of the standard deviation lower bound"
                        )
                elif std_lower_bound is not None:
                    std_lower_bound = np.ones(
                        self.context_dim) * std_lower_bound
            self.std_lower_bound = std_lower_bound
            self.kl_threshold = kl_threshold

        # Create the initial context distribution
        if isinstance(initial_variance, np.ndarray):
            flat_init_chol = GaussianTorchDistribution.flatten_matrix(
                initial_variance, tril=False)
        else:
            flat_init_chol = GaussianTorchDistribution.flatten_matrix(
                initial_variance * np.eye(self.context_dim), tril=False)

        # Create the target distribution
        if isinstance(target_variance, np.ndarray):
            flat_target_chol = GaussianTorchDistribution.flatten_matrix(
                target_variance, tril=False)
        else:
            flat_target_chol = GaussianTorchDistribution.flatten_matrix(
                target_variance * np.eye(self.context_dim), tril=False)

        AbstractSelfPacedTeacher.__init__(self, initial_mean, flat_init_chol,
                                          target_mean, flat_target_chol,
                                          alpha_function, max_kl,
                                          cg_parameters)
        self.bk = {
            'mean': [],
            'covariance': [],
            'steps': [],
            'algo_iterations': [],
            'kl': []
        }
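
Since updates happen in `step_update` rather than per episode, the discounted-return bookkeeping initialised above (`discounted_sum_reward`, `current_disc`, `discount_factor`) presumably accumulates rewards step by step; a small sketch of that pattern, under that assumption:

    discount_factor = 0.99
    current_disc, discounted_sum_reward = 1.0, 0.0
    discounted_sum_rewards = []

    def step_update(reward, done):
        global current_disc, discounted_sum_reward
        discounted_sum_reward += current_disc * reward
        current_disc *= discount_factor
        if done:  # episode boundary: store the return and reset the accumulators
            discounted_sum_rewards.append(discounted_sum_reward)
            current_disc, discounted_sum_reward = 1.0, 0.0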
Code Example #13
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 gmm_fitness_func="aic",
                 warm_start=False,
                 nb_em_init=1,
                 fit_rate=250,
                 alp_max_size=None,
                 alp_buffer_size=500,
                 potential_ks=np.arange(2, 11, 1),
                 random_task_ratio=0.2,
                 nb_bootstrap=None,
                 initial_dist=None):
        '''
            Absolute Learning Progress - Gaussian Mixture Model (https://arxiv.org/abs/1910.07224).

            Args:
                gmm_fitness_func: Fitness criterion when selecting best GMM among range of GMMs varying in number of Gaussians.
                warm_start: Restart new fit by initializing with last fit
                nb_em_init: Number of Expectation-Maximization trials when fitting
                fit_rate: Number of episodes between two fits of the GMM
                alp_max_size: Maximum number of episodes stored
                alp_buffer_size: Maximal number of episodes to account for when computing ALP
                potential_ks: Range of number of Gaussians to try when fitting the GMM
                random_task_ratio: Ratio of randomly sampled tasks vs. tasks sampled using the GMM
                nb_bootstrap: Number of bootstrapping episodes, must be >= fit_rate
                initial_dist: Initial Gaussian distribution. If None, bootstrap with random tasks
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        # Range of number of Gaussians to try when fitting the GMM
        self.potential_ks = potential_ks
        # Restart new fit by initializing with last fit
        self.warm_start = warm_start
        # Fitness criterion when selecting best GMM among range of GMMs varying in number of Gaussians.
        self.gmm_fitness_func = gmm_fitness_func
        # Number of Expectation-Maximization trials when fitting
        self.nb_em_init = nb_em_init
        # Number of episodes between two fits of the GMM
        self.fit_rate = fit_rate
        self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate  # Number of bootstrapping episodes, must be >= fit_rate
        self.initial_dist = initial_dist  # Initial Gaussian distribution. If None, bootstrap with random tasks

        # Ratio of randomly sampled tasks vs. tasks sampled using the GMM
        self.random_task_ratio = random_task_ratio
        self.random_task_generator = Box(self.mins,
                                         self.maxs,
                                         dtype=np.float32)
        self.random_task_generator.seed(self.seed)

        # Maximal number of episodes to account for when computing ALP
        alp_max_size = alp_max_size
        alp_buffer_size = alp_buffer_size

        # Init ALP computer
        self.alp_computer = EmpiricalALPComputer(len(mins),
                                                 max_size=alp_max_size,
                                                 buffer_size=alp_buffer_size)

        self.tasks = []
        self.alps = []
        self.tasks_alps = []

        # Init GMMs
        self.potential_gmms = [self.init_gmm(k) for k in self.potential_ks]
        self.gmm = None

        # Boring book-keeping
        self.bk = {
            'weights': [],
            'covariances': [],
            'means': [],
            'tasks_alps': [],
            'tasks_lps': [],
            'episodes': [],
            'tasks_origin': []
        }
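
A minimal sketch (not repository code) of the model selection implied by `potential_ks` and `gmm_fitness_func="aic"`: one GMM is fit per candidate number of Gaussians on (task, ALP) samples, and the candidate with the best fitness score is kept.

    import numpy as np
    from sklearn.mixture import GaussianMixture

    rng = np.random.default_rng(0)
    tasks_alps = np.column_stack([rng.uniform(size=(250, 2)),   # toy 2-D tasks
                                  rng.uniform(size=250)])       # their ALP values
    potential_ks = np.arange(2, 11, 1)
    candidates = [GaussianMixture(n_components=int(k), n_init=1, random_state=0).fit(tasks_alps)
                  for k in potential_ks]
    gmm = min(candidates, key=lambda g: g.aic(tasks_alps))      # lower AIC is better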
Code Example #14
    def __init__(self,
                 mins,
                 maxs,
                 seed,
                 env_reward_lb,
                 env_reward_ub,
                 state_noise_level,
                 success_distance_threshold,
                 update_size,
                 n_rollouts=2,
                 goid_lb=0.25,
                 goid_ub=0.75,
                 p_old=0.2,
                 use_pretrained_samples=False,
                 initial_dist=None):
        '''
            GoalGAN (http://proceedings.mlr.press/v80/florensa18a.html).

            Code taken from https://github.com/psclklnk/spdl + minor updates.

            Args:
                state_noise_level: Proportion of noise added to goal sampled by the GAN (used on each dimension)
                success_distance_threshold: How far a 'Goal Of Intermediate Difficulty' must be from the others to be
                                            added to the buffer (expressed as a percentage of each dimension)
                update_size: How many new goals must be sampled before training the GAN
                n_rollouts: How many times a goal must be proposed to the student before calculating its mean success
                goid_lb: Lower bound on the mean success for a goal to be considered of 'Intermediate Difficulty'
                goid_ub: Upper bound on the mean success for a goal to be considered of 'Intermediate Difficulty'
                p_old: Probability to sample a goal from the buffer of old goals
                use_pretrained_samples: Whether the GAN should be pretrained using samples from `initial_dist`
                initial_dist: Initial distribution from which samples should be generated to pretrain the GAN
        '''
        AbstractTeacher.__init__(self, mins, maxs, env_reward_lb,
                                 env_reward_ub, seed)

        np.random.seed(self.seed)  # To seed the GAN (sufficient ?)
        tf.set_random_seed(seed)

        tf_config = tf.ConfigProto(allow_soft_placement=True,
                                   inter_op_parallelism_threads=1,
                                   intra_op_parallelism_threads=1)
        # Prevent tensorflow from taking all the gpu memory
        tf_config.gpu_options.allow_growth = True
        self.tf_session = tf.Session(config=tf_config)
        self.gan = StateGAN(
            state_size=len(mins),
            evaluater_size=1,
            state_range=0.5 * (self.maxs - self.mins) +
            1e-6,  # avoid normalization issues for dimensions where min==max
            state_center=mins + 0.5 * (self.maxs - self.mins),
            state_noise_level=(state_noise_level *
                               (self.maxs - self.mins))[None, :],
            generator_layers=[256, 256],
            discriminator_layers=[128, 128],
            noise_size=self.mins.shape[0],
            tf_session=self.tf_session,
            configs={"supress_all_logging": True})
        self.tf_session.run(tf.initialize_local_variables())
        self.replay_noise = state_noise_level * (self.maxs - self.mins)
        self.success_buffer = StateCollection(
            1,
            success_distance_threshold * np.linalg.norm(self.maxs - self.mins))

        self.update_size = update_size
        self.contexts = []
        self.labels = []

        self.p_old = p_old
        self.n_rollouts = n_rollouts
        self.goid_lb = goid_lb
        self.goid_ub = goid_ub

        self.pending_contexts = {}
        self.context_queue = Queue()
        self.episode_counter = 0

        if use_pretrained_samples:
            print("Pretraining GAN...")
            initial_mean, initial_variance = self.get_or_create_dist(
                initial_dist, mins, maxs, subspace=True)
            pretrain_samples = self.random_state.multivariate_normal(
                initial_mean, initial_variance, size=1000)
            pretrain_samples = np.clip(pretrain_samples,
                                       mins,
                                       maxs,
                                       dtype=np.float32)
            self.gan.pretrain(pretrain_samples)

        self.bk = {'dis_log_loss': [], 'gen_log_loss': [], 'episodes': []}
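
A small sketch (not repository code) of the 'Goal Of Intermediate Difficulty' labelling described by `n_rollouts`, `goid_lb` and `goid_ub`: each goal is attempted `n_rollouts` times, and it is labelled positive for GAN training when its mean success lies inside the [goid_lb, goid_ub] band.

    import numpy as np

    n_rollouts, goid_lb, goid_ub = 2, 0.25, 0.75

    def label_goal(successes):
        """successes: one 0/1 outcome per rollout of the same goal."""
        assert len(successes) == n_rollouts
        mean_success = np.mean(successes)
        return float(goid_lb <= mean_success <= goid_ub)  # 1.0 = intermediate difficulty

    labels = [label_goal([1, 0]), label_goal([1, 1]), label_goal([0, 0])]  # [1.0, 0.0, 0.0]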