def check_constraint(self, new_agent):
    """
    Check whether ``new_agent`` has moved too far from the old policy.

    Rolls out ``new_agent`` in ``self.env``, trains a UBM on the old and
    new policy data together, extracts one supervector per policy and
    measures the adapted-GMM distance between them.

    Returns:
        True if the distance is at least ``self.max_kl_constraint``,
        False otherwise.
    """
    rollout_data = do_manual_rollouts(new_agent, self.env, self.n_rollouts)
    # Small Gaussian noise, added in place to the fresh rollout data
    rollout_data += np.random.randn(*rollout_data.shape) * 0.001
    pooled_data = np.concatenate((self.old_policy_data, rollout_data), axis=0)

    # UBM training spams "less unique centroids" warnings; keep them quiet
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ubm = gmm_tools.train_ubm(
            pooled_data, n_components=self.n_centroids, verbose=0
        )

    raveled_old = gmm_tools.trajectories_to_supervector(self.old_policy_data, ubm)
    raveled_new = gmm_tools.trajectories_to_supervector(rollout_data, ubm)

    # Supervectors come back as raveled 1D vectors: restore the shape of
    # the UBM means before computing the distance
    component_shape = ubm.means_.shape
    distance = gmm_tools.adapted_gmm_distance(
        raveled_old.reshape(component_shape),
        raveled_new.reshape(component_shape),
        ubm.precisions_,
        ubm.weights_,
    )
    return bool(distance >= self.max_kl_constraint)
def extract_pivector_worker(num_traj_index, num_traj, num_components, env):
    """
    Extract and store pivectors for every trained UBM of one configuration.

    Policy names are discovered from the UBM files matching ``UBM_TEMPLATE``
    for the given ``num_traj`` / ``num_components`` / ``env``. For each policy
    and each repetition in ``1..NUM_REPETITIONS`` the UBM is loaded, one
    pivector is extracted per trajectory file and everything is written to
    the path given by ``PIVECTOR_TEMPLATE``. Existing output files are skipped.

    Parameters:
        num_traj_index: Not used inside this function.
        num_traj: Value substituted for ``num_traj`` in the path templates.
        num_components: Value substituted for ``num_components`` in the
            path templates.
        env: Environment name substituted into the path templates.
    """
    trained_ubms = glob(UBM_TEMPLATE.format(
        num_traj=num_traj,
        num_components=num_components,
        env=env,
        policy_name="*",
        repetition_num="*",
    ))
    trained_ubm_dirs = [os.path.basename(os.path.dirname(x)) for x in trained_ubms]
    # The policy name is taken from the 4th- and 3rd-to-last "_"-separated
    # parts of the UBM directory name
    policy_names = sorted({"_".join(x.split("_")[-4:-2]) for x in trained_ubm_dirs})

    for policy_name in policy_names:
        for repetition in range(1, NUM_REPETITIONS + 1):
            template_fields = dict(
                num_traj=num_traj,
                num_components=num_components,
                env=env,
                policy_name=policy_name,
                repetition_num=repetition,
            )
            pivector_path = PIVECTOR_TEMPLATE.format(**template_fields)
            # If already exists, skip extracting pivectors for this
            if os.path.isfile(pivector_path):
                continue

            # Load UBM
            ubm_path = UBM_TEMPLATE.format(**template_fields)
            ubm, means, stds = load_ubm(ubm_path)
            # Hacky thing to load the same trajectories as used in UBM training.
            # The context manager closes the archive even if the key is missing.
            with np.load(ubm_path) as ubm_data:
                trajectory_indeces = ubm_data["trajectory_indeces"]

            # Load trajectory data
            trajectories_path = sorted(glob(os.path.join(
                TRAJECTORY_TEMPLATE.format(env=env, policy_name=policy_name), "*"
            )))

            all_pivectors = []
            all_average_episodic_returns = []
            for trajectory_i, trajectory_path in enumerate(trajectories_path):
                # Read everything needed while the archive is open so that
                # the file handle is released before moving to the next file
                with np.load(trajectory_path) as data:
                    all_average_episodic_returns.append(
                        data["episodic_rewards"].mean()
                    )
                    # Sorted keys: same order as with UBM training
                    keys = sorted(data.keys())
                    datas = [data[key] for key in keys if "traj" in key]
                # Take trajectories at the same indices as used in training UBM
                datas = [datas[i] for i in trajectory_indeces[trajectory_i]]
                data = np.concatenate(datas, axis=0)
                data = (data - means) / stds
                all_pivectors.append(trajectories_to_supervector(data, ubm))

            np.savez(
                pivector_path,
                pivectors=np.array(all_pivectors),
                average_episodic_rewards=all_average_episodic_returns,
                covariances=ubm.covariances_,
                weights=ubm.weights_,
            )
def extract_pivectors(unparsed_args):
    """
    Extract one pivector per trajectory file of the given experiments.

    Command-line arguments (parsed from ``unparsed_args``):
        --inputs: One or more experiment paths. The environment name is
            taken as the second "_"-separated part of each path.
        ubms: Directory containing ``<env>_ubm.npz`` files.

    For every file under ``<experiment>/TRAJECTORIES_DIR`` the normalized
    states are turned into a pivector, which is stored under
    ``<experiment>/PIVECTORS_DIR`` with the same file name, together with the
    average episodic reward and the UBM covariances and weights.
    """
    parser = ArgumentParser("Extract pivectors for given experiments")
    parser.add_argument(
        "--inputs", type=str, nargs="+", required=True,
        help="Paths to experiments for which pivectors should be extracted.")
    parser.add_argument(
        "ubms", type=str,
        help="Directory where UBM models reside, one per environment.")
    args = parser.parse_args(unparsed_args)

    for experiment_path in tqdm(args.inputs):
        env = experiment_path.split("_")[1]
        pivector_dir = os.path.join(experiment_path, PIVECTORS_DIR)
        os.makedirs(pivector_dir, exist_ok=True)

        ubm, means, stds = load_ubm(
            os.path.join(args.ubms, "{}_ubm.npz".format(env)))

        trajectory_paths = glob.glob(
            os.path.join(experiment_path, TRAJECTORIES_DIR, "*"))
        for trajectory_path in tqdm(trajectory_paths, leave=False):
            trajectory_name = os.path.basename(trajectory_path)
            # Context manager closes the archive once the arrays are read,
            # instead of leaving one open handle per trajectory file
            with np.load(trajectory_path) as data:
                average_episodic_reward = data["episodic_rewards"].mean()
                states = np.concatenate(
                    [data[key] for key in data.keys() if "traj" in key])

            # Normalize
            states = (states - means) / stds
            pivector = trajectories_to_supervector(states, ubm)

            new_path = os.path.join(pivector_dir, trajectory_name)
            # Also store component weights and covariances for future reference
            np.savez(
                new_path,
                pivector=pivector,
                average_episodic_reward=average_episodic_reward,
                covariances=ubm.covariances_,
                weights=ubm.weights_,
            )
        # Drop the reference to this environment's UBM before the next one
        del ubm
def _load_traj_arrays(npz_path):
    """Return all arrays of an npz archive whose key contains "traj"."""
    with np.load(npz_path) as data:
        return [data[key] for key in data.keys() if "traj" in key]


def _extract_and_save_pivector(traj_path, pivec_path, ubm, means, stds):
    """Extract the pivector of one trajectory file and save it to ``pivec_path``."""
    with np.load(traj_path) as data:
        average_episodic_reward = data["episodic_rewards"].mean()
        states = [data[key] for key in data.keys() if "traj" in key]
    states = np.concatenate(states, axis=0)
    states = (states - means) / stds
    pivec = trajectories_to_supervector(states, ubm)
    # Also store component weights and covariances for future reference
    np.savez(
        pivec_path,
        pivector=pivec,
        average_episodic_reward=average_episodic_reward,
        covariances=ubm.covariances_,
        weights=ubm.weights_,
    )


def train_ubm_and_extract_pivectors(env, experiment_paths):
    """
    Train UBM for pivector extraction and adapt GMMs for the given
    experiments.

    If no UBM file exists at ``UBM_PATH.format(env)``, one is trained on the
    trajectories found under ``BC_TRAJECTORY_DIRECTORY`` plus the
    ``FINAL_MODEL_TRAJECTORIES`` file of every experiment (at most
    ``MAX_UBM_DATA`` rows, normalized per dimension) and saved together with
    the normalization statistics. Otherwise the stored UBM is loaded.

    Pivectors are then extracted for every BC trajectory file and for the
    final model of each experiment. Output files that already exist are
    left untouched.

    Parameters:
        env: Value substituted into ``UBM_PATH``.
        experiment_paths: Iterable of experiment directories.
    """
    ubm_path = UBM_PATH.format(env)
    os.makedirs(os.path.dirname(ubm_path), exist_ok=True)

    # Train UBM if one does not exist
    if not os.path.isfile(ubm_path):
        # Load GAIL and BC data, and final agent data as well
        all_data = []
        for experiment_path in tqdm(experiment_paths, desc="ubm-load"):
            traj_paths = glob(
                os.path.join(experiment_path, BC_TRAJECTORY_DIRECTORY, "*"))
            for traj_path in traj_paths:
                all_data.extend(_load_traj_arrays(traj_path))
            # Load the data of the final model. This has to read the final
            # model's own file, not whatever BC file the loop above ended on.
            final_traj_path = os.path.join(
                experiment_path, FINAL_MODEL_TRAJECTORIES)
            all_data.extend(_load_traj_arrays(final_traj_path))
        all_data = np.concatenate(all_data, axis=0)

        # Restrict amount of data
        if all_data.shape[0] > MAX_UBM_DATA:
            np.random.shuffle(all_data)
            all_data = all_data[:MAX_UBM_DATA]

        # Normalize
        means = all_data.mean(axis=0)
        stds = all_data.std(axis=0)
        all_data = (all_data - means) / stds

        ubm = train_ubm(all_data, n_components=NUM_COMPONENTS)
        save_ubm(ubm_path, ubm, means, stds)
    else:
        print("Skipping UBM training (found)")
        # Same variable names as in the training branch, so extraction below
        # works regardless of which branch ran
        ubm, means, stds = load_ubm(ubm_path)

    # Extract pivectors
    for experiment_path in experiment_paths:
        pivec_dir = os.path.join(experiment_path, BC_PIVECTOR_DIRECTORY)
        os.makedirs(pivec_dir, exist_ok=True)

        traj_paths = glob(
            os.path.join(experiment_path, BC_TRAJECTORY_DIRECTORY, "*"))
        for traj_path in traj_paths:
            pivec_path = os.path.join(pivec_dir, os.path.basename(traj_path))
            if os.path.isfile(pivec_path):
                continue
            _extract_and_save_pivector(traj_path, pivec_path, ubm, means, stds)

        # Extract pivector for the final model as well
        pivec_path = os.path.join(experiment_path, FINAL_MODEL_PIVECTOR)
        traj_path = os.path.join(experiment_path, FINAL_MODEL_TRAJECTORIES)
        if not os.path.isfile(pivec_path):
            _extract_and_save_pivector(traj_path, pivec_path, ubm, means, stds)
def _diagonal_gaussian(mean, variance):
    """Build a torch MultivariateNormal with diagonal covariance ``variance + 1e-7``."""
    return th.distributions.MultivariateNormal(
        th.from_numpy(mean).float(),
        th.diag(th.from_numpy(variance + 1e-7)).float())


def compute_novelty_vs_archive(archive, novelty_vector, k, bc_type="terminal", worker_dir=None):
    """
    Compute the novelty of ``novelty_vector`` as the mean distance to its
    ``k`` nearest neighbours.

    Parameters:
        archive: Iterable of archived behaviour vectors (numpy arrays). Not
            read when ``bc_type`` is "supervector".
        novelty_vector: Numpy array describing the behaviour to score.
        k: Number of smallest distances to average.
        bc_type: One of
            - "terminal": ``euclidean_distance`` between each archive point
              and ``novelty_vector``.
            - "gaussian": symmetric KL divergence between diagonal Gaussians.
              Archive points hold means in their first half and variances
              in their second half. ``novelty_vector`` is either such a
              vector (1D) or a 2D array whose per-column mean and variance
              are computed here.
            - "supervector": adapted-GMM distance against the supervectors
              stored in ``NOVELTY_ARCHIVE_FILE_NAME`` under ``worker_dir``.
        worker_dir: Directory holding the archive file ("supervector" only).

    Returns:
        Mean of the ``k`` smallest distances.

    Raises:
        NotImplementedError: For an unknown ``bc_type`` (raised when the
            first archive point is processed).
    """
    distances = []
    # ``np.float`` was only an alias of the builtin float and has been removed
    # from NumPy; float64 is the same dtype.
    nov = novelty_vector.astype(np.float64)
    if bc_type == "supervector":
        # Built outside the retry loops so that an invalid worker_dir fails
        # immediately instead of being retried forever
        archive_path = os.path.join(worker_dir, NOVELTY_ARCHIVE_FILE_NAME)

        ubm = None
        means = None
        stds = None
        # A fight against race-condition: If failed to load, try again bit later
        while ubm is None:
            try:
                ubm, means, stds = gmm_tools.load_ubm(archive_path)
            except Exception:
                print("[Warning] Failed to load UBM file. Trying again...")
                time.sleep(0.1)

        # Normalize data
        normalized_states = (novelty_vector - means) / stds
        my_supervector = gmm_tools.trajectories_to_supervector(
            normalized_states, ubm)
        my_supervector = my_supervector.reshape(ubm.means_.shape)
        precisions = ubm.precisions_
        weights = ubm.weights_

        # The archived supervectors are stored in the same file as the UBM
        archive_data = None
        while archive_data is None:
            try:
                archive_data = np.load(archive_path)
            except Exception:
                print("[Warning] Failed to load archive file. Trying again...")
                time.sleep(0.1)
        try:
            other_supervectors = archive_data["supervectors"]
        finally:
            archive_data.close()

        for i in range(other_supervectors.shape[0]):
            distances.append(gmm_tools.adapted_gmm_distance(
                my_supervector, other_supervectors[i], precisions, weights))
    else:
        # Distribution of the query, built once on the first archive point
        # (the mean/variance split position comes from the archive point)
        nov_distribution = None
        for point in archive:
            if bc_type == "terminal":
                distances.append(
                    euclidean_distance(point.astype(np.float64), nov))
            elif bc_type == "gaussian":
                midpoint = len(point) // 2
                if nov_distribution is None:
                    if nov.ndim == 2:
                        # Need to compute mean and cov
                        nov_distribution = _diagonal_gaussian(
                            np.mean(nov, axis=0), np.var(nov, axis=0))
                    else:
                        # Already computed mean+var vector
                        nov_distribution = _diagonal_gaussian(
                            nov[:midpoint], nov[midpoint:])
                point_distribution = _diagonal_gaussian(
                    point[:midpoint], point[midpoint:])
                with th.no_grad():
                    kl_distance = (
                        th.distributions.kl_divergence(nov_distribution, point_distribution)
                        + th.distributions.kl_divergence(point_distribution, nov_distribution))
                distances.append(kl_distance.item())
            else:
                raise NotImplementedError(
                    "bc_type {} not implemented".format(bc_type))

    # Pick k nearest neighbors
    distances = np.array(distances)
    top_k = distances[distances.argsort()[:k]]
    return top_k.mean()
def get_mean_bc(env, policy, tslimit, num_rollouts=1, bc_type="terminal", for_archive=False, worker_dir=None):
    """
    Run ``num_rollouts`` rollouts of ``policy`` and summarize their
    behaviour characterizations (BCs).

    Parameters:
        env: Environment handed to ``policy.rollout``.
        policy: Object whose ``rollout`` returns a 3-tuple with the BC data
            as its last element.
        tslimit: Timestep limit handed to ``policy.rollout``.
        num_rollouts: Number of rollouts to run.
        bc_type: "terminal", "gaussian" or "supervector".
        for_archive: Whether the result is about to be sent to the archive
            (only matters for "supervector").
        worker_dir: Directory for raw data and the archive file
            ("supervector" with ``for_archive`` only).

    Returns:
        - "terminal": mean of the per-rollout BCs.
        - "gaussian": per-dimension mean followed by per-dimension variance
          of the concatenated rollouts.
        - "supervector": the concatenated rollout data, or ``np.array([42])``
          as a placeholder when ``for_archive`` is set (after the raw data
          was stored and the UBM/supervector file was rebuilt).

    Raises:
        NotImplementedError: For any other ``bc_type``.
    """
    final_state_only = bc_type == "terminal"
    rollout_bcs = []
    for _ in range(num_rollouts):
        _reward, _steps, rollout_bc = policy.rollout(
            env, timestep_limit=tslimit, bc_only_final_state=final_state_only)
        rollout_bcs.append(rollout_bc)

    if bc_type == "terminal":
        return np.mean(rollout_bcs, axis=0)
    if bc_type not in ("gaussian", "supervector"):
        raise NotImplementedError("bc_type {} not implemented".format(bc_type))

    # Both remaining types work on the individual rollouts joined together
    visited_states = np.concatenate(rollout_bcs, axis=0)

    if bc_type == "gaussian":
        # Fit a simple monogaussian
        return np.concatenate(
            (np.mean(visited_states, axis=0), np.var(visited_states, axis=0)),
            axis=0)

    if not for_archive:
        # The visited states themselves are the "BC"
        return visited_states

    # Store the raw data of this policy as a new item
    raw_dir = os.path.join(worker_dir, NOVELTY_RAW_DATA_DIR_NAME)
    os.makedirs(raw_dir, exist_ok=True)
    next_index = len(os.listdir(raw_dir))
    np.save(os.path.join(raw_dir, "policy_{}".format(next_index)), visited_states)

    # Read back the data of all stored policies and train a UBM on it
    stored = [
        np.load(os.path.join(raw_dir, filename))
        for filename in os.listdir(raw_dir)
    ]
    pooled = np.concatenate(stored, axis=0)
    means = pooled.mean(axis=0)
    stds = pooled.std(axis=0)
    pooled = (pooled - means) / stds
    ubm = gmm_tools.train_ubm(
        pooled, n_components=NOVELTY_SUPERVECTOR_COMPONENTS, verbose=0)

    # One supervector per stored policy, shaped like the UBM means
    mean_shape = ubm.means_.shape
    supervectors = [
        gmm_tools.trajectories_to_supervector(
            (policy_data - means) / stds, ubm).reshape(mean_shape)
        for policy_data in stored
    ]
    gmm_tools.save_ubm(
        os.path.join(worker_dir, NOVELTY_ARCHIVE_FILE_NAME),
        ubm, means, stds, supervectors=supervectors)

    # Placeholder value. This goes to the archive, but it is never supposed
    # to be read during supervector novelty search
    return np.array([42])
def main():
    """
    Produce the method illustration figures for three Pendulum agents:
    a random one, one that always outputs the highest action and a small
    neural network.

    Trajectories are collected for every agent, reduced to two variables
    (theta and theta-velocity) and used together to train a GMM-UBM. Scatter
    plots of the data, the UBM components and the per-agent adapted
    components are written as PNG files under ``figures/``.
    """
    env = StateWrapper(gym.make("Pendulum-v0"))

    def network_activation(obs):
        # Trained stable-baselines agent
        hidden = np.tanh((obs @ NETWORK_W1) + NETWORK_B1)
        return (hidden @ NETWORK_W2) + NETWORK_B2

    def to_theta_plane(states):
        # Take theta and theta-velocity as the variables to study (2D plots)
        return np.stack((np.arccos(states[:, 0]), states[:, 2]), axis=1)

    def draw_component(axes, center_x, center_y, covariance, **extra):
        # One-std ellipse plus a "+" marker at the component mean
        confidence_ellipse(center_x, center_y, covariance, axes, n_std=1,
                           edgecolor="red", linewidth=2, **extra)

    # Always pick random action
    random_agent = SimpleAgentClass(lambda obs: env.action_space.sample())
    # Always pick high
    always_high_agent = SimpleAgentClass(lambda obs: env.action_space.high)
    network_agent = SimpleAgentClass(network_activation)

    # Gather observations
    collected = []
    for label, agent in (("random", random_agent),
                         ("always-high", always_high_agent),
                         ("network", network_agent)):
        print("Collecting {} trajectories...".format(label))
        trajectories, rewards = collect_trajectories(env, agent, NUM_TRAJECTORIES)
        print("Average reward: {}".format(np.mean(rewards)))
        collected.append(trajectories)

    random_trajectories, always_high_trajectories, network_trajectories = [
        to_theta_plane(np.concatenate(trajectories, axis=0))
        for trajectories in collected
    ]
    all_data = np.concatenate(
        (random_trajectories, always_high_trajectories, network_trajectories),
        axis=0)

    # Train the GMM-UBM.
    # Using multiple inits here for a similar results on different runs
    ubm = train_ubm(all_data, n_components=N_COMPONENTS, n_init=5)

    # Extract policy supervectors (or adapted means)
    random_means = trajectories_to_supervector(
        random_trajectories, ubm).reshape(N_COMPONENTS, 2)
    always_high_means = trajectories_to_supervector(
        always_high_trajectories, ubm).reshape(N_COMPONENTS, 2)
    network_means = trajectories_to_supervector(
        network_trajectories, ubm).reshape(N_COMPONENTS, 2)

    # Compute stuff for contours
    mins, maxs = all_data.min(axis=0), all_data.max(axis=0)
    theta_space = np.linspace(mins[0], maxs[0], num=NUM_LINSPACE)
    thetavel_space = np.linspace(mins[1], maxs[1], num=NUM_LINSPACE)
    locations = np.array(np.meshgrid(theta_space, thetavel_space)).T.reshape(-1, 2)

    os.makedirs("figures", exist_ok=True)

    # All data, first alone and then with the UBM components on top
    fig = pyplot.figure(figsize=FIG_SIZE)
    pyplot.axis("off")
    pyplot.scatter(all_data[:, 0], all_data[:, 1], alpha=0.003, s=MARKERSIZE)
    pyplot_remove_margins()
    pyplot.savefig("figures/method_all_data.png", **SAVEFIG_PARAMS)

    ax = pyplot.gca()
    for component in range(N_COMPONENTS):
        cov = np.diag(ubm.covariances_[component])
        mean_x, mean_y = ubm.means_[component, 0], ubm.means_[component, 1]
        draw_component(ax, mean_x, mean_y, cov)
        pyplot.scatter(mean_x, mean_y, marker="+", c="red")
    pyplot_remove_margins()
    pyplot.savefig("figures/method_ubm_all_data.png", **SAVEFIG_PARAMS)
    # Remember the limits so that all following figures share them
    xlim = pyplot.xlim()
    ylim = pyplot.ylim()
    pyplot.close(fig)

    # Repeat above for all different datas
    per_policy = (
        ("figures/method_random_data.png",
         "figures/method_ubm_random_data.png",
         random_means, random_trajectories),
        ("figures/method_always_high_data.png",
         "figures/method_ubm_always_high_data.png",
         always_high_means, always_high_trajectories),
        ("figures/method_network_data.png",
         "figures/method_ubm_network_data.png",
         network_means, network_trajectories),
    )
    for name, ubm_name, means, policy_data in per_policy:
        # Plot data
        fig = pyplot.figure(figsize=FIG_SIZE)
        pyplot.scatter(policy_data[:, 0], policy_data[:, 1],
                       alpha=0.003, s=MARKERSIZE)
        pyplot.xlim(xlim)
        pyplot.ylim(ylim)
        pyplot.axis("off")
        pyplot_remove_margins()
        pyplot.savefig(name, **SAVEFIG_PARAMS)

        # Plot adapted GMM: the old components and the new components
        ax = pyplot.gca()
        for component in range(N_COMPONENTS):
            cov = np.diag(ubm.covariances_[component])
            mean_x, mean_y = means[component, 0], means[component, 1]
            draw_component(ax, mean_x, mean_y, cov)
            pyplot.scatter(mean_x, mean_y, marker="+", c="red")
            # Old component
            ubm_mean_x = ubm.means_[component, 0]
            ubm_mean_y = ubm.means_[component, 1]
            draw_component(ax, ubm_mean_x, ubm_mean_y, cov,
                           alpha=0.3, linestyle="--")
            pyplot.scatter(ubm_mean_x, ubm_mean_y, marker="+", c="red", alpha=0.3)
            # Arrow from the old mean to the adapted mean
            pyplot.arrow(ubm_mean_x, ubm_mean_y,
                         mean_x - ubm_mean_x, mean_y - ubm_mean_y,
                         color="red", width=0.01, linewidth=0.25)
        pyplot.xlim(xlim)
        pyplot.ylim(ylim)
        pyplot_remove_margins()
        pyplot.savefig(ubm_name, **SAVEFIG_PARAMS)
        pyplot.close(fig)