def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, absolute_lp=False, fit_rate=250,
             potential_ks=np.arange(2, 11, 1), random_task_ratio=0.2, nb_bootstrap=None, initial_dist=None):
    '''
    Covar-GMM: Covariance Gaussian Mixture Model. Implementation of IGMM
    (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3893575/) with minor improvements.

    Args:
        absolute_lp: Original version does not use Absolute LP, only LP
        fit_rate: Number of episodes between two fits of the GMM
        potential_ks: Range of numbers of Gaussians to try when fitting the GMM
        random_task_ratio: Ratio of randomly sampled tasks VS tasks sampled using the GMM
        nb_bootstrap: Number of bootstrapping episodes, must be >= fit_rate
        initial_dist: Initial Gaussian distribution. If None, bootstrap with random tasks
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

    # Range of numbers of Gaussians to try when fitting the GMM
    self.potential_ks = potential_ks
    # Ratio of randomly sampled tasks VS tasks sampled using the GMM
    self.random_task_ratio = random_task_ratio
    self.random_task_generator = Box(self.mins, self.maxs, dtype=np.float32)
    self.random_task_generator.seed(self.seed)

    # Number of episodes between two fits of the GMM
    self.fit_rate = fit_rate
    # Number of bootstrapping episodes, must be >= fit_rate
    self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate
    # Initial Gaussian distribution. If None, bootstrap with random tasks
    self.initial_dist = initial_dist
    # Original version does not use Absolute LP, only LP
    self.absolute_lp = absolute_lp

    self.tasks = []
    self.tasks_times_rewards = []
    self.all_times = np.arange(0, 1, 1 / self.fit_rate)
    self.gmm = None

    # Boring book-keeping
    self.bk = {'weights': [], 'covariances': [], 'means': [], 'tasks_lps': [],
               'episodes': [], 'tasks_origin': []}
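# Illustrative sketch (not part of the original class): how the time/reward
# pairs stored in `tasks_times_rewards` can yield a learning-progress signal.
# Covar-GMM uses the covariance between normalized time and reward inside each
# Gaussian; the flat window and names below are assumptions for clarity.
import numpy as np

fit_rate = 250
all_times = np.arange(0, 1, 1 / fit_rate)    # normalized episode times, as above
rewards = np.random.uniform(size=fit_rate)   # placeholder episodic rewards
lp_proxy = np.cov(all_times, rewards)[0, 1]  # time/reward covariance acts as an LP proxy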
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub):
    '''
    Random teacher sampling tasks uniformly at random over the task space.
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
    self.random_task_generator = Box(np.array(mins), np.array(maxs), dtype=np.float32)
    self.random_task_generator.seed(self.seed)
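# Usage sketch (assumption, not from the file): drawing a uniform task from a
# seeded gym Box space, exactly as `random_task_generator` is built above.
import numpy as np
from gym.spaces import Box

low = np.array([-1., 0.], dtype=np.float32)
high = np.array([1., 5.], dtype=np.float32)
space = Box(low, high, dtype=np.float32)
space.seed(42)         # reproducible sampling
task = space.sample()  # uniform vector within [mins, maxs]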
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, update_frequency=100,
             setter_loss_noise_ub=0.01, setter_hidden_size=128):
    '''
    Setter-Solver (https://arxiv.org/abs/1909.12892).
    Made with the help of https://drive.google.com/drive/folders/1yjhztFeX67tHEImXCiP_UAQfQ-wFvV4Y.

    Args:
        update_frequency: How many episodes between two updates of the neural networks
        setter_loss_noise_ub: Upper bound of the noise added to tasks in the Setter's loss
        setter_hidden_size: Number of neurons in the Setter's layers
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
    self.nb_dims = len(self.mins)

    tf.set_random_seed(seed)
    tf_config = tf.ConfigProto()
    # Prevent tensorflow from taking all the gpu memory
    tf_config.gpu_options.allow_growth = True
    self.tf_session = tf.Session(config=tf_config)

    self.update_frequency = update_frequency
    self.episode_counter = 0

    self.judge = Judge(hidden_sizes=[64, 64, 64], tf_session=self.tf_session, goal_size=self.nb_dims)
    self.setter = FlatRnvp(latent_size=self.nb_dims, num_blocks=3, num_layers_per_block=3,
                           tf_session=self.tf_session, judge_output_op=self.judge._mlp,
                           hidden_size=setter_hidden_size, final_non_linearity=ClippedSigmoid,
                           loss_noise_ub=setter_loss_noise_ub, random_state=self.random_state)
    self.tf_session.run(tf.global_variables_initializer())

    self.goal_buffer = GoalBuffer()
    self.pending_goals = []
    self.bk = {'judge_loss': [], 'setter_loss': [], 'episodes': []}
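# Sketch (assumption, names illustrative): the role of `update_frequency` and
# `episode_counter` set above. The Judge/Setter networks are only trained once
# enough episodes have been collected since the last update.
def should_update_networks(episode_counter, update_frequency):
    # True every `update_frequency` episodes (e.g. 100, 200, ...)
    return episode_counter > 0 and episode_counter % update_frequency == 0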
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, max_region_size=200,
             alp_window_size=None, nb_split_attempts=50, sampling_in_leaves_only=False,
             min_region_size=None, min_dims_range_ratio=1 / 6, discard_ratio=1 / 4):
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

    # Maximal number of (task, reward) pairs a region can hold before splitting
    self.maxlen = max_region_size
    self.alp_window = self.maxlen if alp_window_size is None else alp_window_size

    # Initialize the regions' tree
    self.tree = Tree()
    self.regions_bounds = [Box(self.mins, self.maxs, dtype=np.float32)]
    self.regions_alp = [0.]
    self.tree.create_node('root', 'root',
                          data=Region(maxlen=self.maxlen,
                                      r_t_pairs=[deque(maxlen=self.maxlen + 1),
                                                 deque(maxlen=self.maxlen + 1)],
                                      bounds=self.regions_bounds[-1],
                                      alp=self.regions_alp[-1]))
    self.nb_dims = len(mins)
    self.nb_split_attempts = nb_split_attempts

    # Whether task sampling uses parent and child regions (False) or only leaf regions (True)
    self.sampling_in_leaves_only = sampling_in_leaves_only

    # Additional tricks over the original RIAC, enforcing splitting rules

    # 1 - Minimum population required for both children when splitting --> set to 1 to cancel
    self.minlen = self.maxlen / 20 if min_region_size is None else min_region_size

    # 2 - Minimum children region size (compared to the initial range of each dimension)
    # Set min_dims_range_ratio to 1/np.inf to cancel
    self.dims_ranges = self.maxs - self.mins
    self.min_dims_range_ratio = min_dims_range_ratio

    # 3 - If after nb_split_attempts no split is valid, flush the oldest points of the parent region
    # If 1- and 2- are canceled, this is canceled as well since any split will be valid
    self.discard_ratio = discard_ratio

    # Book-keeping
    self.sampled_tasks = []
    self.all_boxes = []
    self.all_alps = []
    self.update_nb = -1
    self.split_iterations = []

    # Store the constructor's arguments (captures all locals at this point, including self)
    self.hyperparams = locals()
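# Sketch (assumption, names illustrative): the two extra split-validity rules
# described above, applied to a candidate split of a region into two children.
import numpy as np

def split_is_valid(child_populations, child_extents, dims_ranges,
                   minlen, min_dims_range_ratio):
    # 1 - both children must hold at least `minlen` (task, reward) pairs
    enough_points = all(pop >= minlen for pop in child_populations)
    # 2 - each child must span at least `min_dims_range_ratio` of the initial
    #     range on every dimension
    large_enough = all(np.all(extent >= min_dims_range_ratio * dims_ranges)
                       for extent in child_extents)
    return enough_points and large_enough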
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, step_size, max_reward_thr,
             min_reward_thr, initial_dist=None, boundary_sampling_p=0.5, queue_len=10, scale_reward=False):
    '''
    Automatic Domain Randomization (https://arxiv.org/abs/1910.07113).

    Args:
        step_size: Size of the growth (or decrease) of a bound at update
        max_reward_thr: Upper reward threshold used to inflate the distribution
        min_reward_thr: Lower reward threshold used to deflate the distribution
        initial_dist: The mean of this initial distribution is used as the initial task of ADR
        boundary_sampling_p: Probability to sample a dimension at one of its bounds
        queue_len: Size of the queue associated with each bound. Once full, ADR increases or decreases the bound.
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
    self.nb_dims = len(self.mins)

    # Boundary sampling probability p_r
    self.bound_sampling_p = boundary_sampling_p

    # ADR step size
    self.step_size = step_size

    # Max reward threshold: the sampling distribution inflates if the mean reward goes above this
    self.max_reward_threshold = max_reward_thr
    if scale_reward:
        self.max_reward_threshold = np.interp(self.max_reward_threshold,
                                              (self.env_reward_lb, self.env_reward_ub), (0, 1))

    # Min reward threshold: the sampling distribution deflates if the mean reward goes below this
    self.min_reward_threshold = min_reward_thr
    if scale_reward:
        self.min_reward_threshold = np.interp(self.min_reward_threshold,
                                              (self.env_reward_lb, self.env_reward_ub), (0, 1))

    # Max queue length
    self.window_len = queue_len

    # Set the initial task space to a predefined calibrated task
    initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True)
    # Single-task version (as in the original paper)
    self.cur_mins = np.array(initial_mean, dtype=np.float32)  # current min bounds
    self.cur_maxs = np.array(initial_mean, dtype=np.float32)  # current max bounds
    self.task_space = Box(self.cur_mins, self.cur_maxs, dtype=np.float32)
    self.task_space.seed(self.seed)

    # Init queues, one per task space dimension
    self.min_queues = [deque(maxlen=self.window_len) for _ in range(self.nb_dims)]
    self.max_queues = [deque(maxlen=self.window_len) for _ in range(self.nb_dims)]

    # Boring book-keeping
    self.episode_nb = 0
    self.bk = {'task_space': [(self.cur_mins.copy(), self.cur_maxs.copy())], 'episodes': []}
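# Worked example of the `scale_reward` rescaling above: with environment
# rewards bounded in [-150, 350], a raw threshold of 230 maps linearly to
# (230 - (-150)) / (350 - (-150)) = 0.76 in [0, 1]. Bounds are illustrative.
import numpy as np
scaled_thr = np.interp(230, (-150, 350), (0, 1))  # -> 0.76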
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, update_frequency, update_offset,
             alpha_function, initial_dist=None, target_dist=None, max_kl=0.1, std_lower_bound=None,
             kl_threshold=None, cg_parameters=None, use_avg_performance=False,
             max_context_buffer_size=1000, reset_contexts=True, discount_factor=0.99):
    '''
    Self-Paced Deep Reinforcement Learning (https://papers.nips.cc/paper/2020/hash/68a9750337a418a86fe06c1991a1d64c-Abstract.html).
    Taken from https://github.com/psclklnk/spdl and wrapped to our architecture.
    Works in a non-episodic setup; updates are thus made in the `step_update` method.

    Args:
        update_frequency: Update frequency of the sampling distribution (in steps)
        update_offset: How many steps must be performed before starting to update the distribution
        alpha_function: Function calculating the alpha parameter
        initial_dist: Initial distribution to start from
        target_dist: Target distribution to reach
        max_kl: Maximum KL-divergence authorized between the old and new distributions when updating
        std_lower_bound: Minimum std authorized on the sampling distribution if the KL-divergence between the
                         latter and the target distribution is greater than `kl_threshold`. Set this to `None`
                         if no constraint on the std must be applied
        kl_threshold: Threshold enforcing the std constraint
        cg_parameters: Additional parameters for the Conjugate Gradient method
        use_avg_performance: Whether the alpha function must use the average performance
        max_context_buffer_size: Maximum size of the buffer storing sampled tasks
        reset_contexts: Whether the buffer should be reset when queried
        discount_factor: Discount factor used in the Universal Value Function
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
    torch.manual_seed(self.seed)

    # Random subspace of the task space if no initial dist
    initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True)
    # Full task space if no target dist
    target_mean, target_variance = self.get_or_create_dist(target_dist, mins, maxs, subspace=False)

    context_bounds = (np.array(mins), np.array(maxs))

    self.update_frequency = update_frequency
    self.update_offset = update_offset
    self.step_counter = 0
    self.discounted_sum_reward = 0
    self.discount_factor = discount_factor
    self.discounted_sum_rewards = []
    self.current_disc = 1
    self.pending_initial_state = None
    self.algorithm_iterations = 0

    # The bounds that we show to the outside are limited to the interval [-1, 1], as this is
    # typically better for neural nets to deal with
    self.context_buffer = Buffer(2, max_context_buffer_size, reset_contexts)
    self.context_dim = target_mean.shape[0]
    self.context_bounds = context_bounds
    self.use_avg_performance = use_avg_performance

    if std_lower_bound is not None and kl_threshold is None:
        raise RuntimeError("Error! Both the lower bound on the standard deviation and the KL threshold need to be set")
    else:
        if std_lower_bound is not None:
            if isinstance(std_lower_bound, np.ndarray):
                if std_lower_bound.shape[0] != self.context_dim:
                    raise RuntimeError("Error! Wrong dimension of the standard deviation lower bound")
            else:
                std_lower_bound = np.ones(self.context_dim) * std_lower_bound
        self.std_lower_bound = std_lower_bound
        self.kl_threshold = kl_threshold

    # Create the initial context distribution
    if isinstance(initial_variance, np.ndarray):
        flat_init_chol = GaussianTorchDistribution.flatten_matrix(initial_variance, tril=False)
    else:
        flat_init_chol = GaussianTorchDistribution.flatten_matrix(initial_variance * np.eye(self.context_dim),
                                                                  tril=False)

    # Create the target distribution
    if isinstance(target_variance, np.ndarray):
        flat_target_chol = GaussianTorchDistribution.flatten_matrix(target_variance, tril=False)
    else:
        flat_target_chol = GaussianTorchDistribution.flatten_matrix(target_variance * np.eye(self.context_dim),
                                                                    tril=False)

    AbstractSelfPacedTeacher.__init__(self, initial_mean, flat_init_chol, target_mean, flat_target_chol,
                                      alpha_function, max_kl, cg_parameters)
    self.bk = {'mean': [], 'covariance': [], 'steps': [], 'algo_iterations': [], 'kl': []}
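# Hypothetical `alpha_function` sketch (signature and name assumed, not from
# the file): SPDL trades off matching the target distribution against agent
# performance via an alpha schedule that the user supplies as a callable.
def linear_alpha_function(iteration, performance):
    # Illustrative schedule: alpha grows with performance, clipped to [0, 1]
    return float(min(1.0, max(0.0, performance / 100.0)))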
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, gmm_fitness_func="aic",
             warm_start=False, nb_em_init=1, fit_rate=250, alp_max_size=None, alp_buffer_size=500,
             potential_ks=np.arange(2, 11, 1), random_task_ratio=0.2, nb_bootstrap=None, initial_dist=None):
    '''
    Absolute Learning Progress - Gaussian Mixture Model (https://arxiv.org/abs/1910.07224).

    Args:
        gmm_fitness_func: Fitness criterion used to select the best GMM among a range of GMMs varying
                          in number of Gaussians
        warm_start: Restart each new fit by initializing with the last fit
        nb_em_init: Number of Expectation-Maximization trials when fitting
        fit_rate: Number of episodes between two fits of the GMM
        alp_max_size: Maximum number of episodes stored
        alp_buffer_size: Maximal number of episodes to account for when computing ALP
        potential_ks: Range of numbers of Gaussians to try when fitting the GMM
        random_task_ratio: Ratio of randomly sampled tasks VS tasks sampled using the GMM
        nb_bootstrap: Number of bootstrapping episodes, must be >= fit_rate
        initial_dist: Initial Gaussian distribution. If None, bootstrap with random tasks
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)

    # Range of numbers of Gaussians to try when fitting the GMM
    self.potential_ks = potential_ks
    # Restart each new fit by initializing with the last fit
    self.warm_start = warm_start
    # Fitness criterion used to select the best GMM
    self.gmm_fitness_func = gmm_fitness_func
    # Number of Expectation-Maximization trials when fitting
    self.nb_em_init = nb_em_init
    # Number of episodes between two fits of the GMM
    self.fit_rate = fit_rate
    # Number of bootstrapping episodes, must be >= fit_rate
    self.nb_bootstrap = nb_bootstrap if nb_bootstrap is not None else fit_rate
    # Initial Gaussian distribution. If None, bootstrap with random tasks
    self.initial_dist = initial_dist

    # Ratio of randomly sampled tasks VS tasks sampled using the GMM
    self.random_task_ratio = random_task_ratio
    self.random_task_generator = Box(self.mins, self.maxs, dtype=np.float32)
    self.random_task_generator.seed(self.seed)

    # Init ALP computer (max_size: maximum number of episodes stored;
    # buffer_size: maximal number of episodes to account for when computing ALP)
    self.alp_computer = EmpiricalALPComputer(len(mins), max_size=alp_max_size, buffer_size=alp_buffer_size)

    self.tasks = []
    self.alps = []
    self.tasks_alps = []

    # Init GMMs
    self.potential_gmms = [self.init_gmm(k) for k in self.potential_ks]
    self.gmm = None

    # Boring book-keeping
    self.bk = {'weights': [], 'covariances': [], 'means': [], 'tasks_alps': [],
               'tasks_lps': [], 'episodes': [], 'tasks_origin': []}
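# Sketch (assumption) of the idea behind `EmpiricalALPComputer`: following the
# ALP-GMM paper, the ALP of a new (task, reward) pair is the absolute reward
# difference with the closest previously sampled task.
import numpy as np

def empirical_alp(task, reward, old_tasks, old_rewards):
    if len(old_tasks) == 0:
        return 0.0
    distances = np.linalg.norm(np.asarray(old_tasks) - np.asarray(task), axis=1)
    closest = int(np.argmin(distances))
    return abs(reward - old_rewards[closest])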
def __init__(self, mins, maxs, seed, env_reward_lb, env_reward_ub, state_noise_level,
             success_distance_threshold, update_size, n_rollouts=2, goid_lb=0.25, goid_ub=0.75,
             p_old=0.2, use_pretrained_samples=False, initial_dist=None):
    '''
    GoalGAN (http://proceedings.mlr.press/v80/florensa18a.html).
    Code taken from https://github.com/psclklnk/spdl + minor updates.

    Args:
        state_noise_level: Proportion of noise added to a goal sampled by the GAN (used on each dimension)
        success_distance_threshold: How far a Goal Of Intermediate Difficulty must be from the others to be
                                    added to the buffer (expressed as a percentage of each dimension)
        update_size: How many new goals must be sampled before training the GAN
        n_rollouts: How many times a goal must be proposed to the student before computing its mean success
        goid_lb: Lower bound on the mean success for a goal to be of Intermediate Difficulty
        goid_ub: Upper bound on the mean success for a goal to be of Intermediate Difficulty
        p_old: Probability to sample a goal from the buffer of old goals
        use_pretrained_samples: Whether the GAN should be pretrained using samples from `initial_dist`
        initial_dist: Initial distribution from which samples are generated to pretrain the GAN
    '''
    AbstractTeacher.__init__(self, mins, maxs, env_reward_lb, env_reward_ub, seed)
    np.random.seed(self.seed)  # To seed the GAN (sufficient?)

    tf.set_random_seed(seed)
    tf_config = tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    # Prevent tensorflow from taking all the gpu memory
    tf_config.gpu_options.allow_growth = True
    self.tf_session = tf.Session(config=tf_config)

    self.gan = StateGAN(
        state_size=len(mins),
        evaluater_size=1,
        state_range=0.5 * (self.maxs - self.mins) + 1e-6,  # avoid normalization issues for dimensions where min == max
        state_center=mins + 0.5 * (self.maxs - self.mins),
        state_noise_level=(state_noise_level * (self.maxs - self.mins))[None, :],
        generator_layers=[256, 256],
        discriminator_layers=[128, 128],
        noise_size=self.mins.shape[0],
        tf_session=self.tf_session,
        configs={"supress_all_logging": True})
    self.tf_session.run(tf.initialize_local_variables())

    self.replay_noise = state_noise_level * (self.maxs - self.mins)
    self.success_buffer = StateCollection(1, success_distance_threshold * np.linalg.norm(self.maxs - self.mins))

    self.update_size = update_size
    self.contexts = []
    self.labels = []
    self.p_old = p_old
    self.n_rollouts = n_rollouts
    self.goid_lb = goid_lb
    self.goid_ub = goid_ub
    self.pending_contexts = {}
    self.context_queue = Queue()
    self.episode_counter = 0

    if use_pretrained_samples:
        print("Pretraining GAN...")
        initial_mean, initial_variance = self.get_or_create_dist(initial_dist, mins, maxs, subspace=True)
        pretrain_samples = self.random_state.multivariate_normal(initial_mean, initial_variance, size=1000)
        pretrain_samples = np.clip(pretrain_samples, mins, maxs, dtype=np.float32)
        self.gan.pretrain(pretrain_samples)

    self.bk = {'dis_log_loss': [], 'gen_log_loss': [], 'episodes': []}
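# Sketch (assumption, names illustrative): labeling a goal as being of
# Intermediate Difficulty (GOID) once its `n_rollouts` returns are known,
# using the `goid_lb`/`goid_ub` bounds stored above.
def is_goid(successes, goid_lb=0.25, goid_ub=0.75):
    mean_success = sum(successes) / len(successes)
    return goid_lb <= mean_success <= goid_ub

# e.g. is_goid([1, 0], 0.25, 0.75) -> True (mean success 0.5)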