def compute_alice_reward(self, next_obs):
    alice_end_obs = next_obs
    if self.start_generation:
        bob_start_state = self._obs2start_transform(alice_end_obs)
        self.env_bob.update_start_generator(
            FixedStateGenerator(bob_start_state))
    else:
        bob_goal_state = self._obs2goal_transform(alice_end_obs)
        self.env_bob.update_goal_generator(
            FixedStateGenerator(bob_goal_state))
    path_bob = rollout(
        self.env_bob, self.policy_bob,
        max_path_length=max(5, self.max_path_length - self.time),
        # animated=False,
    )
    t_alice = self.time
    t_bob = path_bob['rewards'].shape[0]
    reward = self.gamma * max(
        0, self.alice_bonus + t_bob - self.alice_factor * t_alice)
    # print("t_bob: " + str(t_bob) + ", np.linalg.norm(bob_start_state): " + str(np.linalg.norm(bob_start_state)))
    # print("t_alice: " + str(t_alice), " speed: " + str(np.linalg.norm(bob_start_state) / t_alice))
    # print("reward: " + str(reward))
    return reward
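# Illustrative note (not from the original source): the self-play reward above pays
# Alice for ending in states she reaches quickly but that Bob takes long to solve.
# With hypothetical settings gamma=0.1, alice_bonus=0, alice_factor=0.5:
#   t_alice = 20, t_bob = 40  ->  reward = 0.1 * max(0, 40 - 0.5 * 20) = 3.0
#   t_alice = 40, t_bob = 10  ->  reward = 0.1 * max(0, 10 - 0.5 * 40) = 0.0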
def evaluate_state(state, env, policy, horizon, n_traj=1, full_path=False,
                   key='rewards', as_goals=True, aggregator=(np.sum, np.mean)):
    aggregated_data = []
    paths = []
    if as_goals:
        env.update_goal_generator(FixedStateGenerator(state))
    else:
        env.update_start_generator(FixedStateGenerator(state))
    for j in range(n_traj):
        paths.append(rollout(env, policy, horizon))
        if key in paths[-1]:
            aggregated_data.append(aggregator[0](paths[-1][key]))
        else:
            aggregated_data.append(aggregator[0](paths[-1]['env_infos'][key]))
    mean_reward = aggregator[1](aggregated_data)
    if full_path:
        return mean_reward, paths
    return mean_reward
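# Minimal usage sketch (assumes a pre-built `env`, `policy`, and a candidate
# `start_state` from this codebase; argument values here are illustrative only):
# success_rate, paths = evaluate_state(
#     start_state, env, policy, horizon=500, n_traj=5,
#     full_path=True, key='goal_reached', as_goals=False)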
def optimize(self, iter=0):
    # get paths
    n_starts = len(self.start_states)
    for itr in range(self.algo_alice.n_itr):
        paths_alice = []
        paths_bob = []
        new_start_states = []
        for i in range(self.num_rollouts):
            self.env_alice.update_start_generator(
                FixedStateGenerator(self.start_states[i % n_starts]))
            paths_alice.append(
                rollout(self.env_alice, self.policy_alice,
                        max_path_length=self.max_path_length,
                        animated=False))
            alice_end_obs = paths_alice[i]['observations'][-1]
            new_start_state = self.env_alice._obs2start_transform(alice_end_obs)
            new_start_states.append(new_start_state)
            self.env_bob.update_start_generator(
                FixedStateGenerator(new_start_state))
            paths_bob.append(
                rollout(self.env_bob, self.policy_bob,
                        max_path_length=self.max_path_length,
                        animated=False))

        # update rewards
        paths_alice, paths_bob = self.update_rewards(
            paths_alice=paths_alice, paths_bob=paths_bob, gamma=self.gamma)

        # optimize policies
        if self.optimize_alice:
            self.algo_alice.start_worker()
            self.algo_alice.init_opt()
            training_samples_alice = self.algo_alice.sampler.process_samples(
                itr=iter, paths=paths_alice)
            self.algo_alice.optimize_policy(
                itr=iter, samples_data=training_samples_alice)
        if self.optimize_bob:
            self.algo_bob.start_worker()
            self.algo_bob.init_opt()
            training_samples_bob = self.algo_bob.sampler.process_samples(
                itr=iter, paths=paths_bob)
            self.algo_bob.optimize_policy(
                itr=iter, samples_data=training_samples_bob)
    return np.array(new_start_states)
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1)

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    inner_env = normalize(SwimmerMazeEnv(maze_size_scaling=3))

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=500,  # TODO: increase to 2000 # size speeds up training a bit
        variance=v['brownian_variance'],
        subsample=v['num_new_starts'],
    )  # , animated=True, speedup=1
    np.random.shuffle(seed_starts)

    # with env.set_kill_outside():
    feasible_states = find_all_feasible_states_plotting(
        env, seed_starts, report,
        distance_threshold=0.2,
        brownian_variance=1,
        animate=True,
        limit=v['goal_range'],
        check_feasible=False,
        center=v['goal_center'])
    return
def ant_evaluate(env, policy, init_state=None, max_path_length=2000,
                 animated=True, speedup=2):
    if init_state is not None:
        if len(init_state) == 2:  # first two positions are COM
            init_state.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
        env.update_start_generator(FixedStateGenerator(init_state))
    path = rollout(env, policy, max_path_length=max_path_length,
                   animated=animated, speedup=speedup)
    print("Trajectory length: {}".format(len(path["rewards"])))
    print("Success: {}".format(path["rewards"][-1]))
    return path["rewards"][-1]
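# Hypothetical usage sketch: evaluate a saved ant policy from a 2-D start position
# (the remaining joint coordinates are filled in by ant_evaluate above). Assumes a
# pickled rllab snapshot with 'policy' and 'env' entries, as loaded elsewhere in
# this codebase; the filename is illustrative.
# data = joblib.load('params.pkl')
# ant_evaluate(data['env'], data['policy'], init_state=[1.0, 0.0], animated=False)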
def benchmark_parallel(v):
    # test that parallel mapping actually speeds things up; helpful when >100 states
    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])
    inner_env = normalize(AntMazeEnv())

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    for size in [100, 1000, 5000]:
        for parallel in [True, False]:
            starts = generate_starts(
                env,
                starts=[v['start_goal']],
                horizon=v['initial_brownian_horizon'],
                size=size,  # size speeds up training a bit
                variance=v['brownian_variance'],
                # subsample=v['num_new_starts'],
            )
            start = time.time()
            processes = -1 if parallel else 1
            filtered_starts = parallel_check_feasibility(
                env=env, starts=starts, max_path_length=100,
                n_processes=processes)
            print("Size: {} parallel: {} time: {}".format(
                size, parallel, time.time() - start))
def optimize_batch(self):
    # get paths
    n_starts = len(self.start_states)
    logger.log("N starts: " + str(n_starts))
    all_alice_paths = []
    self.algo_alice.current_itr = 0
    with ExperimentLogger(self.log_dir, 'last_alice', snapshot_mode='last',
                          hold_outter_log=True):
        logger.log("Training Alice")
        for i in range(n_starts):
            self.env_alice.update_start_generator(
                FixedStateGenerator(self.start_states[i % n_starts]))
            logger.log("Num itrs: " + str(self.algo_alice.n_itr))
            alice_paths = self.algo_alice.train()
            all_alice_paths.extend(alice_paths)
        logger.log("All alice paths: " + str(len(all_alice_paths)))

    new_paths = [path for paths in all_alice_paths for path in paths]
    logger.log("New paths: " + str(len(new_paths)))
    new_start_states = [
        self.env_alice._obs2start_transform(path['observations'][-1])
        for path in new_paths
    ]
    t_alices = [path['rewards'].shape[0] for path in new_paths]
    logger.log("new start states: " + str(len(new_start_states)))
    logger.log("self.num_rollouts: " + str(self.num_rollouts))

    new_start_states = np.array(new_start_states)
    if len(new_start_states) < self.num_rollouts:
        sampled_starts = new_start_states
    else:
        sampled_starts = new_start_states[np.random.choice(
            np.shape(new_start_states)[0], size=self.num_rollouts)]
    # return np.array(new_start_states[np.random.choice(new_start_states.shape[0], size=self.num_rollouts)])
    return (sampled_starts, t_alices)
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v['constant_baseline']:
        logger.log("Using constant baseline")
        baseline = ConstantBaseline(env_spec=env.spec, value=1.0)
    else:
        logger.log("Using linear baseline")
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=2, report=report,
                      limit=v['goal_range'], center=v['goal_center'])
    test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                         sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report,
                         center=v['goal_center'], limit=v['goal_range'])  # use goal for plot
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    seed_starts = generate_starts(env, starts=[v['ultimate_goal']],
                                  subsample=v['num_new_starts'])

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")
        starts = generate_starts(env, starts=seed_starts,
                                 subsample=v['num_new_starts'],
                                 horizon=v['brownian_horizon'],
                                 variance=v['brownian_variance'])
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False,
                              n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                as_goal=False, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'],
                                         as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=2, report=report,
                          limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report,
                             center=v['goal_center'], limit=v['goal_range'])

        logger.log("Labeling the starts")
        # labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])
        start_classes, text_labels = convert_label(labels)

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)

        if v['seed_with'] == 'only_goods':
            if len(filtered_raw_starts) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
                seed_starts = filtered_raw_starts
            elif np.sum(start_classes == 0) > np.sum(start_classes == 1):  # if more low reward than high reward
                seed_starts = all_starts.sample(300)  # sample them from the replay
            else:
                seed_starts = generate_starts(env, starts=starts,
                                              horizon=int(v['horizon'] * 10),
                                              subsample=v['num_new_starts'],
                                              variance=v['brownian_variance'] * 10)
        elif v['seed_with'] == 'all_previous':
            seed_starts = starts
        elif v['seed_with'] == 'on_policy':
            seed_starts = generate_starts(env, policy, starts=starts,
                                          horizon=v['horizon'],
                                          subsample=v['num_new_starts'])
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(SwimmerMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    # all_feasible_starts = pickle.load(open(osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states_min.pkl'), 'rb'))
    # print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    # brownian_starts = StateCollection(distance_threshold=v['regularize_starts'])

    # with env.set_kill_outside():
    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],
        size=5000,  # size speeds up training a bit
        variance=v['brownian_variance'],
        subsample=v['num_new_starts'])  # , animated=True, speedup=1
    np.random.shuffle(seed_starts)

    # with env.set_kill_outside():
    feasible_states = find_all_feasible_states_plotting(env, seed_starts,
                                                        distance_threshold=1,
                                                        brownian_variance=1,
                                                        animate=True)
    # print("hi")
    # show where these states are:
    # shuffled_starts = np.array(seed_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=seed_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10)

    if 'gae_lambda' not in v:
        v['gae_lambda'] = 1

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")
        starts = generate_starts(env, starts=seed_starts,
                                 subsample=v['num_new_starts'], size=5000,
                                 horizon=v['brownian_horizon'],
                                 variance=v['brownian_variance'])
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False,
                              n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n')
        report.save()

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                gae_lambda=v['gae_lambda'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            algo.train()

        logger.log('Generating the Heatmap...')
        # policy means should not mean too much
        # plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                             sampling_res=1, n_traj=v['n_traj'], itr=outer_iter,
                             report=report, center=v['goal_center'],
                             limit=v['goal_range'])

        logger.log("Labeling the starts")
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False,
                              n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        if len(filtered_raw_starts) > 0:  # add a ton of noise if all the states I had ended up being high_reward!
            seed_starts = filtered_raw_starts
        else:
            seed_starts = generate_starts(env, starts=starts,
                                          horizon=v['horizon'] * 2,
                                          subsample=v['num_new_starts'],
                                          size=5000,
                                          variance=v['brownian_variance'] * 10)
        all_starts.append(filtered_raw_starts)
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))
    report.save()

    inner_env = normalize(Arm3dDiscEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-1 * v['goal_size']:],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # load the state collection from data_upload
    load_dir = 'data_upload/state_collections/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'disc_all_feasible_states_min.pkl'), 'rb'))
    print("we have %d feasible starts" % all_feasible_starts.size)

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    # brownian_starts = StateCollection(distance_threshold=v['regularize_starts'])

    # with env.set_kill_outside():
    #     seed_starts = generate_starts(env, starts=[v['start_goal']], horizon=10,  # this is smaller as they are seeds!
    #                                   variance=v['brownian_variance'], subsample=v['num_new_starts'])  # , animated=True, speedup=1
    #
    # with env.set_kill_outside():
    #     find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False)

    # show where these states are:
    # shuffled_starts = np.array(all_feasible_starts.state_list)
    # np.random.shuffle(shuffled_starts)
    # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10)

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    env_alice = AliceEnv(env, env, policy, v['horizon'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # with env.set_kill_outside():
        #     starts = generate_starts(env, starts=seed_starts, horizon=v['brownian_horizon'], variance=v['brownian_variance'])

        # regularization of the brownian starts
        # brownian_starts.empty()
        # brownian_starts.append(starts)
        # starts = brownian_starts.sample(size=v['num_new_starts'])

        starts = generate_starts_alice(env_bob=env, env_alice=env_alice,
                                       policy_bob=policy,
                                       policy_alice=policy_alice,
                                       algo_alice=algo_alice,
                                       start_states=[v['start_goal']],
                                       num_new_starts=v['num_new_starts'],
                                       alice_factor=v['alice_factor'],
                                       log_dir=log_dir)

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                as_goal=False, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'],
                                         as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)
        logger.record_tabular('starts', starts.size)

        start_classes, text_labels = convert_label(labels)
        total_starts = labels.shape[0]
        logger.record_tabular('GenStarts_evaluated', total_starts)
        start_class_frac = OrderedDict()  # this needs to be an ordered dict!! (for the log tabular)
        for k in text_labels.keys():
            frac = np.sum(start_classes == k) / total_starts
            logger.record_tabular('GenStart_frac_' + text_labels[k], frac)
            start_class_frac[text_labels[k]] = frac

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(1000)
            mean_reward, paths = evaluate_states(unif_starts, env, policy,
                                                 v['horizon'], n_traj=1,
                                                 key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)

        logger.dump_tabular(with_prefix=True)

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        logger.log("Appending good goals to replay and generating seeds")
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_starts)
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(
        PointMazeEnv(maze_id=v['maze_id'], length=v['maze_length']))
    # inner_env = normalize(PointEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    # initialize all logging arrays on itr0
    outer_iter = 0

    # TODO - show initial states for Alice
    report.new_row()

    ring_spacing = 1
    init_iter = 2

    # Use asymmetric self-play to run Alice to generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceFakeEnv(env, max_path_length=v['alice_horizon'],
                             alice_factor=v['alice_factor'],
                             alice_bonus=v['alice_bonus'], gamma=1,
                             stop_threshold=v['stop_threshold'],
                             ring_spacing=ring_spacing, init_iter=init_iter)

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )
    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # if outer_iter > 10:
        #     init_iter = 5
        # env_alice.set_iter(init_iter)
        # import pdb; pdb.set_trace()
        print("Init iter: " + str(init_iter))
        env_alice = AliceFakeEnv(env, max_path_length=v['alice_horizon'],
                                 alice_factor=v['alice_factor'],
                                 alice_bonus=v['alice_bonus'], gamma=1,
                                 stop_threshold=v['stop_threshold'],
                                 ring_spacing=ring_spacing, init_iter=init_iter)
        algo_alice.env = env_alice
        # env_alice.set_iter(outer_iter)

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice, algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'], log_dir=log_dir)

        # Make fake labels
        labels = np.ones([len(starts), 2])
        radius = init_iter * ring_spacing
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'],
                            summary_string_base='initial starts labels:\n',
                            radius=radius)
        report.save()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.dump_tabular(with_prefix=False)
        report.new_row()
def test_policy(policy, train_env, as_goals=True, visualize=True, sampling_res=1,
                n_traj=1, parallel=True, bounds=None, center=None):
    if parallel:
        return test_policy_parallel(policy, train_env, as_goals, visualize,
                                    sampling_res, n_traj=n_traj, center=center,
                                    bounds=bounds)
    logger.log("Not using the parallel evaluation of the policy!")
    if hasattr(train_env.wrapped_env, 'find_empty_space'):
        maze_env = train_env.wrapped_env
    else:
        maze_env = train_env.wrapped_env.wrapped_env
    empty_spaces = maze_env.find_empty_space()

    old_goal_generator = train_env.goal_generator if hasattr(
        train_env, 'goal_generator') else None
    old_start_generator = train_env.start_generator if hasattr(
        train_env, 'start_generator') else None

    if quick_test:
        sampling_res = 0
        empty_spaces = empty_spaces[:3]
        max_path_length = 100
    else:
        max_path_length = 400

    size_scaling = maze_env.MAZE_SIZE_SCALING
    num_samples = 2 ** sampling_res
    spacing = size_scaling / num_samples
    starting_offset = spacing / 2

    avg_totRewards = []
    avg_success = []
    avg_time = []
    states = []
    distances = []
    for empty_space in empty_spaces:
        delta_x = empty_space[0]  # - train_env.wrapped_env._init_torso_x
        delta_y = empty_space[1]  # - train_env.wrapped_env._init_torso_y
        distance = (delta_x ** 2 + delta_y ** 2) ** 0.5
        distances.append(distance)
    sort_indices = np.argsort(distances)[::-1]
    empty_spaces = np.array(empty_spaces)
    empty_spaces = empty_spaces[sort_indices]

    for empty_space in empty_spaces:
        starting_x = empty_space[0] - size_scaling / 2 + starting_offset
        starting_y = empty_space[1] - size_scaling / 2 + starting_offset
        for i in range(num_samples):
            for j in range(num_samples):
                paths = []
                x = starting_x + i * spacing
                y = starting_y + j * spacing
                if as_goals:
                    goal = (x, y)
                    states.append(goal)
                    train_env.update_goal_selector(FixedStateGenerator(goal))
                else:
                    init_state = np.zeros_like(old_start_generator.state)
                    init_state[:2] = (x, y)
                    states.append(init_state)
                    train_env.update_init_selector(FixedStateGenerator(init_state))
                    print(init_state)
                for n in range(n_traj):
                    path = rollout(train_env, policy, animated=visualize,
                                   max_path_length=max_path_length, speedup=100)
                    paths.append(path)
                avg_totRewards.append(
                    np.mean([np.sum(path['rewards']) for path in paths]))
                avg_success.append(
                    np.mean([int(np.min(path['env_infos']['distance'])
                                 <= train_env.terminal_eps) for path in paths]))
                avg_time.append(
                    np.mean([path['rewards'].shape[0] for path in paths]))
    return avg_totRewards, avg_success, states, spacing, avg_time
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=2)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    uniform_start_generator = UniformListStateGenerator(
        state_list=all_feasible_starts.state_list)

    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
    init_pos = np.array(init_pos)

    env.update_start_generator(uniform_start_generator)

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            # env.update_start_generator(uniform_start_generator)

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            algo.train()

        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy,
                                                 v['horizon'], n_traj=3,
                                                 key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report,
                                itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(init_pos, env, policy,
                                                 v['horizon'], n_traj=5,
                                                 key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos, labels, report=report, itr=outer_iter,
                                limit=v['goal_range'], center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()

        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']
    samples_per_cell = 10  # for the oracle rejection sampling

    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        goal_generator=fixed_goal_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=2, report=report,
                      limit=v['start_range'], center=v['start_center'])
    test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                         sampling_res=sampling_res, n_traj=v['n_traj'],
                         itr=outer_iter, report=report,
                         center=v['start_center'], limit=v['start_range'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])
    total_rollouts = 0

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")

        starts = np.array([]).reshape((-1, v['start_size']))
        k = 0
        while starts.shape[0] < v['num_new_starts']:
            print('good starts collected: ', starts.shape[0])
            logger.log("Sampling and labeling the starts: %d" % k)
            k += 1
            unif_starts = sample_unif_feas(env, samples_per_cell=samples_per_cell)
            if v['start_size'] > 2:
                unif_starts = np.array([
                    np.concatenate([start, np.random.uniform(
                        -v['start_range'], v['start_range'], 2)])
                    for start in unif_starts])
            labels = label_states(unif_starts, env, policy, v['horizon'],
                                  as_goals=False, n_traj=v['n_traj'],
                                  key='goal_reached')
            # plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['start_range'],
            #                     center=v['start_center'], maze_id=v['maze_id'])
            logger.log("Converting the labels")
            init_classes, text_labels = convert_label(labels)
            starts = np.concatenate([starts, unif_starts[init_classes == 2]]).reshape((-1, v['start_size']))

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # report.new_row()

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                )
            )

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                gae_lambda=v['gae_lambda'],
                plot=False,
            )

            algo.train()

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=2, report=report,
                          limit=v['start_range'], center=v['start_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report,
                             center=v['goal_center'], limit=v['goal_range'])

        logger.log("Labeling the starts")
        labels = label_states(starts, env, policy, v['horizon'], as_goals=False,
                              n_traj=v['n_traj'], key='goal_reached')
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # rollouts used for labeling (before TRPO itrs):
        num_empty_spaces = len(unwrap_maze(env).find_empty_space())
        logger.record_tabular('LabelingRollouts',
                              k * v['n_traj'] * samples_per_cell * num_empty_spaces)
        total_rollouts += k * v['n_traj'] * samples_per_cell * num_empty_spaces
        logger.record_tabular('TotalLabelingRollouts', total_rollouts)

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1]
        all_starts.append(filtered_raw_starts)
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    tf_session = tf.Session()

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(state_size=v['start_size'],
                                                    bounds=v['start_range'],
                                                    center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # initialize all logging arrays on itr0
    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')
    plot_policy_means(policy, env, sampling_res=2, report=report,
                      limit=v['start_range'], center=v['start_center'])
    # test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'],
    #                      itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'])

    # GAN
    logger.log("Instantiating the GAN...")
    gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key}
    for key, value in gan_configs.items():
        if value is tf.train.AdamOptimizer:
            gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize'])
        if value is tflearn.initializations.truncated_normal:
            gan_configs[key] = tflearn.initializations.truncated_normal(
                stddev=gan_configs[key + '_stddev'])

    gan = StateGAN(
        state_size=v['start_size'],
        evaluater_size=v['num_labels'],
        state_range=v['start_range'],
        state_center=v['start_center'],
        state_noise_level=v['start_noise_level'],
        generator_layers=v['gan_generator_layers'],
        discriminator_layers=v['gan_discriminator_layers'],
        noise_size=v['gan_noise_size'],
        tf_session=tf_session,
        configs=gan_configs,
    )

    logger.log("pretraining the GAN...")
    if v['smart_init']:
        feasible_starts = generate_starts(
            env, starts=[v['ultimate_goal']],
            horizon=50)  # without giving the policy it does brownian motion
        labels = np.ones((feasible_starts.shape[0], 2)).astype(np.float32)  # make them all good goals
        plot_labeled_states(feasible_starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])
        dis_loss, gen_loss = gan.pretrain(states=feasible_starts,
                                          outer_iters=v['gan_outer_iters'])
        print("Loss of Gen and Dis: ", gen_loss, dis_loss)
    else:
        gan.pretrain_uniform(outer_iters=500, report=report)  # v['gan_outer_iters']

    # log first samples from the GAN
    initial_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

    logger.log("Labeling the starts")
    labels = label_states(initial_starts, env, policy, v['horizon'],
                          as_goals=False, n_traj=v['n_traj'], key='goal_reached')
    plot_labeled_states(initial_starts, labels, report=report, itr=outer_iter,
                        limit=v['goal_range'], center=v['goal_center'],
                        maze_id=v['maze_id'])
    report.new_row()

    all_starts = StateCollection(distance_threshold=v['coll_eps'])

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        # Sample GAN
        logger.log("Sampling starts from the GAN")
        raw_starts, _ = gan.sample_states_with_noise(v['num_new_starts'])

        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([raw_starts, old_starts])
        else:
            starts = raw_starts

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        if v['use_trpo_paths']:
            logger.log("labeling starts with trpo rollouts")
            [starts, labels] = label_states_from_paths(
                trpo_paths, n_traj=2, key='goal_reached',  # using the min n_traj
                as_goal=False, env=env)
            paths = [path for paths in trpo_paths for path in paths]
        else:
            logger.log("labeling starts manually")
            labels, paths = label_states(starts, env, policy, v['horizon'],
                                         as_goals=False, n_traj=v['n_traj'],
                                         key='goal_reached', full_path=True)

        with logger.tabular_prefix("OnStarts_"):
            env.log_diagnostics(paths)

        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])

        logger.log('Generating the Heatmap...')
        plot_policy_means(policy, env, sampling_res=2, report=report,
                          limit=v['start_range'], center=v['start_center'])
        test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'],
                             sampling_res=sampling_res, n_traj=v['n_traj'],
                             itr=outer_iter, report=report,
                             limit=v['goal_range'], center=v['goal_center'])

        # ###### extra for deterministic:
        # logger.log("Labeling the goals deterministic")
        # with policy.set_std_to_0():
        #     labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1)
        # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        logger.log("Training the GAN")
        if np.any(labels):
            gan.train(
                starts, labels,
                v['gan_outer_iters'],
            )

        logger.dump_tabular(with_prefix=False)
        report.new_row()

        # append new goals to list of all goals (replay buffer): Not the low reward ones!!
        filtered_raw_start = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        all_starts.append(filtered_raw_start)
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(AntMazeEnv())

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain'],
        init_std=v['policy_init_std'],
    )

    if v["baseline"] == "MLP":
        baseline = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline = LinearFeatureBaseline(env_spec=env.spec)

    # create Alice
    env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy,
                         max_path_length=v['alice_horizon'],
                         alice_factor=v['alice_factor'],
                         alice_bonus=v['alice_bonus'], gamma=1,
                         stop_threshold=v['stop_threshold'])

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance since different goals will require different variances, making this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # this is only used if adaptive_std is true!
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    if v["baseline"] == "MLP":
        baseline_alice = GaussianMLPBaseline(env_spec=env.spec)
    else:
        baseline_alice = LinearFeatureBaseline(env_spec=env.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    # load the state collection from data_upload
    all_starts = StateCollection(distance_threshold=v['coll_eps'],
                                 states_transform=lambda x: x[:, :2])
    load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/'
    all_feasible_starts = pickle.load(
        open(
            osp.join(config.PROJECT_PATH, load_dir,
                     'good_all_feasible_starts.pkl'), 'rb'))
    logger.log("We have %d feasible starts" % all_feasible_starts.size)

    min_reward = 0.1
    max_reward = 0.9
    improvement_threshold = 0
    old_rewards = None

    init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3],
                [4, 4], [3, 4], [2, 4], [1, 4]][::-1]
    for pos in init_pos:
        pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ])
    init_pos = np.array(init_pos)

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")
        report.save()

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice, algo_alice=algo_alice,
            start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'], log_dir=log_dir)

        if v['filter_bad_starts']:
            logger.log("Prefilter starts: {}".format(len(starts)))
            starts = parallel_check_feasibility(
                env=env, starts=starts,
                max_path_length=v['feasibility_path_length'])
            logger.log("Filtered starts: {}".format(len(starts)))

        logger.log("Total number of starts in buffer: {}".format(all_starts.size))
        if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0:
            old_starts = all_starts.sample(v['num_old_starts'])
            starts = np.vstack([starts, old_starts])

        # Following code should be indented
        with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last',
                              hold_outter_log=True):
            logger.log("Updating the environment start generator")
            env.update_start_generator(
                UniformListStateGenerator(
                    starts.tolist(),
                    persistence=v['persistence'],
                    with_replacement=v['with_replacement'],
                ))

            logger.log("Training the algorithm")
            algo = TRPO(
                env=env,
                policy=policy,
                baseline=baseline,
                batch_size=v['pg_batch_size'],
                max_path_length=v['horizon'],
                n_itr=v['inner_iters'],
                step_size=0.01,
                discount=v['discount'],
                plot=False,
            )

            trpo_paths = algo.train()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))

        logger.log("Labeling the starts")
        [starts, labels] = label_states_from_paths(
            trpo_paths, n_traj=v['n_traj'], key='goal_reached',  # using the min n_traj
            as_goal=False, env=env)
        # labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached')
        start_classes, text_labels = convert_label(labels)
        plot_labeled_states(starts, labels, report=report, itr=outer_iter,
                            limit=v['goal_range'], center=v['goal_center'],
                            maze_id=v['maze_id'])

        labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1))

        # append new states to list of all starts (replay buffer): Not the low reward ones!!
        filtered_raw_starts = [
            start for start, label in zip(starts, labels) if label[0] == 1
        ]
        if len(filtered_raw_starts) == 0:
            # add a ton of noise if all the states I had ended up being high_reward!
            logger.log("Bad Alice! All goals are high reward!")
        all_starts.append(filtered_raw_starts)

        # Useful plotting and metrics (basic test set)
        # need to put this last! otherwise labels variable gets confused
        logger.log("Labeling on uniform starts")
        with logger.tabular_prefix("Uniform_"):
            unif_starts = all_feasible_starts.sample(100)
            mean_reward, paths = evaluate_states(unif_starts, env, policy,
                                                 v['horizon'], n_traj=v['n_traj'],
                                                 key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(unif_starts, labels, report=report,
                                itr=outer_iter, limit=v['goal_range'],
                                center=v['goal_center'], maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            # report.add_text("Success: " + str(np.mean(mean_reward)))

        with logger.tabular_prefix("Fixed_"):
            mean_reward, paths = evaluate_states(init_pos, env, policy,
                                                 v['horizon'], n_traj=5,
                                                 key='goal_reached',
                                                 as_goals=False, full_path=True)
            env.log_diagnostics(paths)
            mean_rewards = mean_reward.reshape(-1, 1)
            labels = compute_labels(
                mean_rewards,
                old_rewards=old_rewards,
                min_reward=min_reward,
                max_reward=max_reward,
                improvement_threshold=improvement_threshold)
            logger.log("Starts labelled")
            plot_labeled_states(init_pos, labels, report=report, itr=outer_iter,
                                limit=v['goal_range'], center=v['goal_center'],
                                maze_id=v['maze_id'],
                                summary_string_base='initial starts labels:\n')
            report.add_text("Fixed Success: " + str(np.mean(mean_reward)))

        report.new_row()
        report.save()

        logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward))
        logger.dump_tabular()
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])

    logger.log("Initializing report...")
    log_dir = logger.get_snapshot_dir()
    if log_dir is None:
        log_dir = "/home/michael/"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1)

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal'])

    inner_env = normalize(AntMazeEnv())

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=fixed_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[-3:-1],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        inner_weight=v['inner_weight'],
        goal_weight=v['goal_weight'],
        terminate_env=True,
    )

    seed_starts = generate_starts(
        env,
        starts=[v['start_goal']],
        horizon=v['initial_brownian_horizon'],  # TODO: increase to 2000
        size=1000,  # size speeds up training a bit
        variance=v['brownian_variance'],
        animated=True,
        subsample=v['num_new_starts'],
    )  # , animated=True, speedup=1
    np.random.shuffle(seed_starts)

    logger.log("Prefilter seed starts: {}".format(len(seed_starts)))
    # breaks code
    # starts = seed_starts
    # # hack to not print code
    # seed_starts = [start for start in seed_starts if check_feasibility(start, env, 10)]
    # starts = np.array(starts)
    # seed_starts = starts
    logger.log("Filtered seed starts: {}".format(len(seed_starts)))

    # with env.set_kill_outside():
    feasible_states = find_all_feasible_states_plotting(
        env, seed_starts, report,
        distance_threshold=0.1,
        brownian_variance=1,
        size=8000,
        animate=True,
        limit=v['goal_range'],
        check_feasible=True,
        check_feasible_path_length=500,
        center=v['goal_center'])
    return
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntMazeEnv(maze_id=v['maze_id'])) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) report.new_row() for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling goals from the GAN") goals = np.random.uniform( np.array(v['goal_center']) - np.array(v['goal_range']), np.array(v['goal_center']) + np.array(v['goal_range']), size=(300, v['goal_size'])) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") if v['unif_goals']: env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) else: env.update_goal_generator(FixedStateGenerator(v['final_goal'])) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) algo.train() logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) logger.log("Labeling the goals") labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) logger.dump_tabular(with_prefix=False) report.new_row()
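The goal-based variant above samples goals uniformly in a box around goal_center and then labels them by success fraction over n_traj rollouts. A self-contained sketch of that sample-then-score pattern is below; sample_uniform_goals, estimate_success, and rollout_fn are made-up names standing in for the repo's goal generator and label_states.

import numpy as np

def sample_uniform_goals(center, half_range, n_goals, goal_size, rng):
    center = np.asarray(center, dtype=float)
    return rng.uniform(center - half_range, center + half_range,
                       size=(n_goals, goal_size))

def estimate_success(goals, rollout_fn, n_traj=3):
    """rollout_fn(goal) -> True if the goal was reached; any callable works."""
    return np.array([np.mean([rollout_fn(g) for _ in range(n_traj)])
                     for g in goals])

rng = np.random.default_rng(0)
goals = sample_uniform_goals(center=[2.0, 2.0], half_range=4.0,
                             n_goals=5, goal_size=2, rng=rng)
# Stand-in "policy": succeeds more often on goals close to the origin.
success = estimate_success(goals, lambda g: rng.random() > np.linalg.norm(g) / 8.0)
print(np.round(success, 2))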
if args.collection_file: all_feasible_starts = pickle.load(open(args.collection_file, 'rb')) with tf.Session() as sess: data = joblib.load(args.file) if "algo" in data: policy = data["algo"].policy env = data["algo"].env else: policy = data['policy'] env = data['env'] while True: if args.init_state: from curriculum.envs.base import FixedStateGenerator env.update_start_generator(FixedStateGenerator( args.init_state)) elif args.collection_file: from curriculum.envs.base import UniformListStateGenerator init_states = all_feasible_starts.sample(1000) env.update_start_generator( UniformListStateGenerator(init_states)) if args.deterministic: with policy.set_std_to_0(): path = rollout(env, policy, max_path_length=args.max_path_length, animated=True, speedup=args.speedup) else: path = rollout(env, policy, max_path_length=args.max_path_length, animated=True, speedup=args.speedup)
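The replay script above, like every training loop in this section, relies on a rollout helper that returns a dict of arrays keyed by 'observations', 'actions', and 'rewards'. The stand-in below (assumed names, not the rllab implementation) just makes that contract concrete with a toy environment and policy.

import numpy as np

def simple_rollout(env, policy, max_path_length=100):
    """Collect one trajectory; env needs reset()/step(a), policy needs get_action(o)."""
    observations, actions, rewards = [], [], []
    obs = env.reset()
    for _ in range(max_path_length):
        action = policy.get_action(obs)
        next_obs, reward, done, info = env.step(action)
        observations.append(obs)
        actions.append(action)
        rewards.append(reward)
        obs = next_obs
        if done:
            break
    return dict(observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards))

class RandomWalkEnv(object):
    """Toy 2-D environment: the action is added to the state, done when far from origin."""
    def reset(self):
        self.state = np.zeros(2)
        return self.state.copy()
    def step(self, action):
        self.state = self.state + action
        done = np.linalg.norm(self.state) > 3.0
        return self.state.copy(), -1.0, done, {}

class RandomPolicy(object):
    def get_action(self, obs):
        return np.random.normal(scale=0.5, size=2)

path = simple_rollout(RandomWalkEnv(), RandomPolicy(), max_path_length=50)
print(path['rewards'].shape)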
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v["baseline"] == "MLP": baseline = GaussianMLPBaseline(env_spec=env.spec) else: baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 all_starts = StateCollection(distance_threshold=v['coll_eps']) # seed_starts: from which we will be performing brownian motion exploration seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts']) def plot_states(states, report, itr, summary_string, **kwargs): states = np.array(states) if states.size == 0: states = np.zeros((1, 2)) img = plot_labeled_samples( states, np.zeros(len(states), dtype='uint8'), markers={0: 'o'}, text_labels={0: "all"}, **kwargs) report.add_image(img, 'itr: {}\n{}'.format(itr, summary_string), width=500) for outer_iter in range(1, v['outer_iters']): report.new_row() logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") plot_states( seed_starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string="seed starts") starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], horizon=v['brownian_horizon'], variance=v['brownian_variance']) plot_states( starts, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string="brownian starts") sampled_from_buffer = [] if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: sampled_from_buffer = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, sampled_from_buffer]) plot_states( sampled_from_buffer, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string="states sampled from buffer") labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='all starts before update\n') with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the 
environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', as_goal=False, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states( starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) start_classes, text_labels = convert_label(labels) plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base="all starts after update\n") with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] all_starts.append(filtered_raw_starts) if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: logger.log("Only goods A") seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum(start_classes == 1): # if more low reward than high reward logger.log("Only goods B") seed_starts = all_starts.sample(300) # sample them from the replay else: logger.log("Only goods C") # add a ton of noise if all the states I had ended up being high_reward seed_starts = generate_starts( env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts elif v['seed_with'] == 'on_policy': seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts']) logger.log('Generating Heatmap...') plot_policy_means( policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center']) _, _, states, returns, successes = test_and_plot_policy2( policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) eval_state_path = osp.join(log_dir, "eval_states.json") if not osp.exists(eval_state_path): with open(eval_state_path, 'w') as f: json.dump(np.array(states).tolist(), f) with open(osp.join(log_dir, 'eval_pos_per_state_mean_return.csv'), 'a') as f: writer = csv.writer(f) row = [outer_iter] + list(returns) writer.writerow(row) with open(osp.join(log_dir, 'eval_pos_per_state_mean_success.csv'), 'a') as f: writer = csv.writer(f) row = [outer_iter] + list(successes) writer.writerow(row) logger.dump_tabular() report.save() if outer_iter == 1 or outer_iter % 5 == 0 and v.get('scratch_dir', False): command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], '')) print("Running command:\n{}".format(command)) subprocess.run(command.split(), check=True) if v.get('scratch_dir', False): command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], '')) 
print("Running command:\n{}".format(command)) subprocess.run(command.split(), check=True)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['start_goal']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-1 * v['goal_size']: ], # the goal are the last 9 coords terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=v['policy_hidden_sizes'], # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v['baseline'] == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) # load the state collection from data_upload load_dir = 'data_upload/state_collections/' all_feasible_starts = pickle.load( open( osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states.pkl'), 'rb')) # all_feasible_starts = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_04_230000.pkl'), 'rb')) # all_feasible_starts = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad4.pkl'), 'rb')) # all_feasible_starts2 = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_min_rad4.pkl'), 'rb')) # all_feasible_starts3 = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_max_rad2.pkl'), 'rb')) print("we have %d feasible starts" % all_feasible_starts.size) all_starts = StateCollection(distance_threshold=v['coll_eps']) brownian_starts = StateCollection( distance_threshold=v['regularize_starts']) logger.log( 'Generating seed starts from the goal (horizon 10, subsample 600 of them)' ) with algo.env.set_kill_outside(radius=v['kill_radius']): seed_starts = generate_starts( env, starts=[v['start_goal']], horizon=10, # this is smaller as they are seeds! 
variance=v['brownian_variance'], subsample=v['num_new_starts']) # , animated=True, speedup=10) # seed_starts = all_feasible_starts.states # with env.set_kill_outside(radius=0.4): # find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False) # # show where these states are: # shuffled_starts = np.array(all_feasible_starts.state_list) # np.random.shuffle(shuffled_starts) # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], # zero_action=True, animated=True, speedup=10) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") with algo.env.set_kill_outside(radius=v['kill_radius']): starts = generate_starts(algo.env, starts=seed_starts, horizon=v['brownian_horizon'], variance=v['brownian_variance']) # regularization of the brownian starts brownian_starts.empty() brownian_starts.append(starts) starts = brownian_starts.sample(size=v['num_new_starts']) if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 50 * (outer_iter // 50 + 1), snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") algo.env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) # algo.start_worker() logger.log("Training the algorithm") algo.current_itr = 0 trpo_paths = algo.train(already_init=outer_iter > 1) # import pdb; pdb.set_trace() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=False, env=algo.env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(starts, algo.env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): algo.env.log_diagnostics(paths) logger.record_tabular('brownian_starts', brownian_starts.size) start_classes, text_labels = convert_label(labels) total_starts = labels.shape[0] logger.record_tabular('GenStarts_evaluated', total_starts) start_class_frac = OrderedDict( ) # this needs to be an ordered dict!! 
(for the log tabular) for k in text_labels.keys(): frac = np.sum(start_classes == k) / total_starts logger.record_tabular('GenStart_frac_' + text_labels[k], frac) start_class_frac[text_labels[k]] = frac labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.log("Labeling on uniform starts") with logger.tabular_prefix("Uniform_4med_"): unif_starts = all_feasible_starts.sample(500) unif_starts = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant') mean_reward, paths = evaluate_states(unif_starts, algo.env, policy, v['horizon'], n_traj=1, key='goal_reached', as_goals=False, full_path=True) algo.env.log_diagnostics(paths) # with logger.tabular_prefix("Uniform_4med_bis_"): # unif_starts = all_feasible_starts.sample(200) # unif_starts1bis = np.pad(unif_starts, ((0, v['start_size'] - unif_starts.shape[1])), 'constant') # mean_reward1bis, paths1bis = evaluate_states(unif_starts1bis, algo.env, policy, v['horizon'], n_traj=1, # key='goal_reached', as_goals=False, full_path=True) # algo.env.log_diagnostics(paths1bis) # with logger.tabular_prefix("Uniform_4min_"): # unif_starts2 = all_feasible_starts2.sample(200) # unif_starts2 = np.pad(unif_starts2, ((0, v['start_size'] - unif_starts2.shape[1])), 'constant') # mean_reward2, paths2 = evaluate_states(unif_starts2, algo.env, policy, v['horizon'], n_traj=1, # key='goal_reached', as_goals=False, full_path=True) # algo.env.log_diagnostics(paths2) # with logger.tabular_prefix("Uniform_2max_"): # unif_starts3 = all_feasible_starts3.sample(200) # unif_starts3 = np.pad(unif_starts3, ((0, v['start_size'] - unif_starts3.shape[1])), 'constant') # mean_reward3, paths3 = evaluate_states(unif_starts3, algo.env, policy, v['horizon'], n_traj=1, # key='goal_reached', as_goals=False, full_path=True) # algo.env.log_diagnostics(paths3) logger.dump_tabular(with_prefix=True) # append new states to list of all starts (replay buffer): if v['seed_with'] == 'only_goods': logger.log("Appending good goals to replay and generating seeds") filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] all_starts.append(filtered_raw_starts) if len(filtered_raw_starts) > 0: seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum( start_classes == 1): # if more low reward than high reward seed_starts = all_starts.sample( 300) # sample them from the replay else: # add a tone of noise if all the states I had ended up being high_reward! with algo.env.set_kill_outside(radius=v['kill_radius']): seed_starts = generate_starts( algo.env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': logger.log("Appending all goals to replay and generating seeds") all_starts.append(starts) seed_starts = starts elif v['seed_with'] == 'on_policy': all_starts.append(starts) with algo.env.set_kill_outside(radius=v['kill_radius']): seed_starts = generate_starts(algo.env, policy, horizon=v['horizon'], subsample=v['num_new_starts'])
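The Arm3dKey loop above leans on StateCollection both as a replay buffer (all_starts) and as a regularizer for the Brownian starts (brownian_starts with a distance threshold). The toy class below sketches the behaviour we assume: only keep states farther than distance_threshold from everything already stored, and sample uniformly from the pool. It is a simplification, not the repo's StateCollection (which also supports a states_transform).

import numpy as np

class SimpleStateCollection(object):
    def __init__(self, distance_threshold=0.1):
        self.distance_threshold = distance_threshold
        self._states = []

    @property
    def size(self):
        return len(self._states)

    def append(self, states):
        for s in np.atleast_2d(np.asarray(states, dtype=float)):
            if not self._states:
                self._states.append(s)
                continue
            dists = np.linalg.norm(np.array(self._states) - s, axis=1)
            if dists.min() > self.distance_threshold:
                self._states.append(s)

    def sample(self, size):
        idx = np.random.randint(0, len(self._states), size=size)
        return np.array(self._states)[idx]

buf = SimpleStateCollection(distance_threshold=0.3)
buf.append(np.random.uniform(-1, 1, size=(500, 2)))
print(buf.size, buf.sample(5).shape)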
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=1000) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntMazeEnv()) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal']) load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/' save_dir = 'data/debug/' # with open(os.path.join(config.PROJECT_PATH, save_dir, "test.pkl"), 'wb') as handle: # pickle.dump({}, handle) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v["baseline"] == "MLP": baseline = GaussianMLPBaseline(env_spec=env.spec) else: baseline = LinearFeatureBaseline(env_spec=env.spec) # load the state collection from data_upload all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2]) # initial brownian horizon and size are pretty important logger.log("Brownian horizon: {}".format(v['initial_brownian_horizon'])) seed_starts = generate_starts( env, starts=[v['start_goal']], horizon=v['initial_brownian_horizon'], size=15000, variance=v['brownian_variance'], animated=False, ) if v['filter_bad_starts']: logger.log("Prefilter seed starts: {}".format(len(seed_starts))) seed_starts = parallel_check_feasibility( env=env, starts=seed_starts, max_path_length=v['feasibility_path_length']) logger.log("Filtered seed starts: {}".format(len(seed_starts))) # can also filter these starts optionally # all_feasible_starts = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb')) # logger.log("We have %d feasible starts" % all_feasible_starts.size) min_reward = 0.1 max_reward = 0.9 improvement_threshold = 0 old_rewards = None init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3], [4, 4], [3, 4], [2, 4], [1, 4]][::-1] for pos in init_pos: pos.extend([ 0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ]) init_pos = np.array(init_pos) with open(osp.join(log_dir, 'init_pos.json'), 'w') as f: json.dump(init_pos.tolist(), f) for outer_iter in range(1, v['outer_iters'] + 1): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") report.save() # generate starts from the previous seed starts, which are defined below starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], size=2000, horizon=v['brownian_horizon'], variance=v['brownian_variance']) # note: this messes with the balance between starts and old_starts! 
if v['filter_bad_starts']: logger.log("Prefilter starts: {}".format(len(starts))) starts = parallel_check_feasibility( env=env, starts=starts, max_path_length=v['feasibility_path_length']) logger.log("Filtered starts: {}".format(len(starts))) logger.log("Total number of starts in buffer: {}".format( all_starts.size)) if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: # with open(os.path.join(config.PROJECT_PATH, save_dir, "qval{}.pkl".format(outer_iter)), 'wb') as handle: # pickle.dump(all_starts.q_vals, handle) # with open(os.path.join(config.PROJECT_PATH, save_dir, "preval{}.pkl".format(outer_iter)), 'wb') as handle: # pickle.dump(all_starts.prev_vals, handle) old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) # plot starts before training # takes too much time # labels = label_states(starts, env, policy, v['horizon'], # as_goals=False, n_traj=v['n_traj'], key='goal_reached') # plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], # center=v['goal_center'], maze_id=v['maze_id'], # summary_string_base='initial starts labels:\n') # Following code should be indented with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() logger.log("Labeling the starts") [starts, labels] = label_states_from_paths(trpo_paths, n_traj=v['n_traj'], key='goal_reached', as_goal=False, env=env) start_classes, text_labels = convert_label(labels) plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] all_starts.append(filtered_raw_starts) if v['seed_with'] == 'only_goods': if len( filtered_raw_starts ) > 0: # add a ton of noise if all the states I had ended up being high_reward! logger.log("We have {} good starts!".format( len(filtered_raw_starts))) seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum( start_classes == 1): # if more low reward than high reward logger.log( "More bad starts than good starts, sampling seeds from replay buffer" ) seed_starts = all_starts.sample( 300) # sample them from the replay else: logger.log("More good starts than bad starts, resampling") seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=10000, variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts filtered_raw_starts = starts # no filtering done else: raise Exception # need to put this last! 
otherwise labels variable gets confused logger.log("Labeling on uniform starts") if not v["debug"]: # with logger.tabular_prefix("Uniform_"): # unif_starts = all_feasible_starts.sample(100) # mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached', # as_goals=False, full_path=True) # env.log_diagnostics(paths) # mean_rewards = mean_reward.reshape(-1, 1) # labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward, # improvement_threshold=improvement_threshold) # logger.log("Starts labelled") # plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], # center=v['goal_center'], maze_id=v['maze_id'], # summary_string_base='initial starts labels:\n') # report.add_text("Uniform Success: " + str(np.mean(mean_reward))) with logger.tabular_prefix("Fixed_"): mean_reward, paths = evaluate_states(init_pos, env, policy, v['horizon'], n_traj=5, key='goal_reached', as_goals=False, full_path=True) with open( osp.join(log_dir, 'init_pos_per_state_mean_return.csv'), 'a') as f: writer = csv.writer(f) row = [outer_iter] + list(mean_reward) writer.writerow(row) env.log_diagnostics(paths) mean_rewards = mean_reward.reshape(-1, 1) labels = compute_labels( mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward, improvement_threshold=improvement_threshold) logger.log("Starts labelled") plot_labeled_states( init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.add_text("Fixed Success: " + str(np.mean(mean_reward))) report.new_row() report.save() logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward)) logger.dump_tabular() if outer_iter == 1 or outer_iter % 5 == 0 and v.get( 'scratch_dir', False): command = 'rsync -a --delete {} {}'.format( os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], '')) print("Running command:\n{}".format(command)) subprocess.run(command.split(), check=True) if v.get('scratch_dir', False): command = 'rsync -a {} {}'.format(os.path.join(log_dir, ''), os.path.join(v['scratch_dir'], '')) print("Running command:\n{}".format(command)) subprocess.run(command.split(), check=True)
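The 'only_goods' branch above decides where the next round of Brownian exploration should be seeded: from the good starts just found, from the replay buffer when failures dominate, or from a heavily re-noised version of the current starts. The sketch below condenses that decision rule with our own helper names; the replay sampling is simplified to sampling with replacement.

import numpy as np

def choose_seed_starts(starts, labels, replay_buffer, renoise_fn, n_replay=300):
    """labels: (N,) array, 1 = good (intermediate difficulty), 0 = otherwise."""
    labels = np.asarray(labels).ravel()
    good_starts = [s for s, l in zip(starts, labels) if l == 1]
    if len(good_starts) > 0:
        return np.array(good_starts)
    if np.sum(labels == 0) > np.sum(labels == 1) and len(replay_buffer) > 0:
        idx = np.random.randint(0, len(replay_buffer), size=n_replay)
        return np.asarray(replay_buffer)[idx]
    # everything was high reward: add a lot of noise to spread the frontier
    return renoise_fn(starts)

starts = np.random.uniform(-1, 1, size=(10, 2))
labels = np.zeros(10)
replay = np.random.uniform(-1, 1, size=(50, 2))
seeds = choose_seed_starts(starts, labels, replay,
                           renoise_fn=lambda s: s + np.random.normal(scale=1.0, size=s.shape))
print(seeds.shape)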
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(Arm3dKeyEnv(ctrl_cost_coeff=v['ctrl_cost_coeff'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['start_out']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-1 * v['goal_size']: ], # the goal are the last 9 coords terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v['baseline'] == 'linear': baseline = LinearFeatureBaseline(env_spec=env.spec) elif v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # load the state collection from data_upload # load the state collection from data_upload load_dir = 'data_upload/state_collections/' all_feasible_starts = pickle.load( # open(osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_states_med_rad2.pkl'), 'rb')) open( osp.join(config.PROJECT_PATH, load_dir, 'key_all_feasible_04_230000.pkl'), 'rb')) uniform_start_generator = UniformListStateGenerator( state_list=all_feasible_starts.state_list) env.update_start_generator(uniform_start_generator) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'] * v['outer_iters'], step_size=0.01, discount=v['discount'], plot=False, ) algo.train()
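This baseline skips the curriculum entirely and trains on starts drawn uniformly from a precomputed collection of feasible states. The class below is a minimal stand-in for that behaviour (assumed method name next_start; the repo's UniformListStateGenerator has its own interface): every episode gets a start sampled uniformly from the fixed list.

import numpy as np

class UniformListStartGenerator(object):
    def __init__(self, state_list, rng=None):
        self._states = np.asarray(state_list, dtype=float)
        self._rng = np.random.default_rng() if rng is None else rng

    def next_start(self):
        """Draw the start state for the next episode uniformly from the list."""
        return self._states[self._rng.integers(len(self._states))]

gen = UniformListStartGenerator(np.random.uniform(-1, 1, size=(1000, 2)))
print(gen.next_start())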
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/michael/" report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntMazeEnv()) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) #baseline = LinearFeatureBaseline(env_spec=env.spec) if v["baseline"] == "MLP": baseline = GaussianMLPBaseline(env_spec=env.spec) else: baseline = LinearFeatureBaseline(env_spec=env.spec) # load the state collection from data_upload all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2]) # can also filter these starts optionally load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/' all_feasible_starts = pickle.load( open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb')) logger.log("We have %d feasible starts" % all_feasible_starts.size) min_reward = 0.1 max_reward = 0.9 improvement_threshold = 0 old_rewards = None # hardest to easiest init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3], [4, 4], [3, 4], [2, 4], [1, 4] ][::-1] for pos in init_pos: pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ]) array_init_pos = np.array(init_pos) init_pos = [tuple(pos) for pos in init_pos] online_start_generator = Online_TCSL(init_pos) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") report.save() # generate starts from the previous seed starts, which are defined below dist = online_start_generator.get_distribution() # added logger.log(np.array_str(online_start_generator.get_q())) # how to log Q values? # with logger.tabular_prefix("General: "): # logger.record_tabular("Q values:", online_start_generator.get_q()) logger.log(np.array_str(dist)) # Following code should be indented with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") #TODO: might be faster to sample if we just create a roughly representative UniformListStateGenerator? 
env.update_start_generator( ListStateGenerator( init_pos, dist ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() logger.log("Labeling the starts") [starts, labels, mean_rewards, updated] = label_states_from_paths(trpo_paths, n_traj=v['n_traj'], key='goal_reached', # using the min n_traj as_goal=False, env=env, return_mean_rewards=True, order_of_states=init_pos) start_classes, text_labels = convert_label(labels) plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) online_start_generator.update_q(np.array(mean_rewards), np.array(updated)) # added labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: # add a ton of noise if all the states I had ended up being high_reward! logger.log("We have {} good starts!".format(len(filtered_raw_starts))) seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum(start_classes == 1): # if more low reward than high reward logger.log("More bad starts than good starts, sampling seeds from replay buffer") seed_starts = all_starts.sample(300) # sample them from the replay else: logger.log("More good starts than bad starts, resampling") seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=10000, variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts else: raise Exception all_starts.append(filtered_raw_starts) # need to put this last! 
otherwise labels variable gets confused logger.log("Labeling on uniform starts") with logger.tabular_prefix("Uniform_"): unif_starts = all_feasible_starts.sample(100) mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached', as_goals=False, full_path=True) env.log_diagnostics(paths) mean_rewards = mean_reward.reshape(-1, 1) labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward, improvement_threshold=improvement_threshold) logger.log("Starts labelled") plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') # report.add_text("Success: " + str(np.mean(mean_reward))) with logger.tabular_prefix("Fixed_"): mean_reward, paths = evaluate_states(array_init_pos, env, policy, v['horizon'], n_traj=5, key='goal_reached', as_goals=False, full_path=True) env.log_diagnostics(paths) mean_rewards = mean_reward.reshape(-1, 1) labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward, improvement_threshold=improvement_threshold) logger.log("Starts labelled") plot_labeled_states(array_init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.add_text("Fixed Success: " + str(np.mean(mean_reward))) report.new_row() report.save() logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward)) logger.dump_tabular()
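The variant above replaces Brownian start generation with Online_TCSL, which maintains Q-values over the fixed init_pos list and turns them into a sampling distribution. We assume it follows the Teacher-Student Curriculum Learning recipe of preferring tasks with high absolute learning progress; the bandit below is a guess at that mechanism (our own class and parameter names), not the repo's implementation.

import numpy as np

class LearningProgressBandit(object):
    def __init__(self, n_tasks, step=0.3, temperature=0.1):
        self.q = np.zeros(n_tasks)            # EMA of |reward change| per task
        self.last_reward = np.zeros(n_tasks)
        self.step = step
        self.temperature = temperature

    def get_distribution(self):
        z = self.q / self.temperature
        z -= z.max()                          # numerical stability
        p = np.exp(z)
        return p / p.sum()

    def update_q(self, mean_rewards, updated):
        """mean_rewards: latest per-task mean reward; updated: bool mask of tasks visited."""
        mean_rewards = np.asarray(mean_rewards, dtype=float)
        updated = np.asarray(updated, dtype=bool)
        progress = np.abs(mean_rewards - self.last_reward)
        self.q[updated] = (1 - self.step) * self.q[updated] + self.step * progress[updated]
        self.last_reward[updated] = mean_rewards[updated]

bandit = LearningProgressBandit(n_tasks=12)
bandit.update_q(mean_rewards=np.linspace(0, 1, 12), updated=np.ones(12, dtype=bool))
print(np.round(bandit.get_distribution(), 3))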
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] samples_per_cell = 10 # for the oracle rejection sampling # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug" report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') plot_policy_means(policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) report.new_row() all_starts = StateCollection(distance_threshold=v['coll_eps']) # Use asymmetric self-play to run Alice to generate starts for Bob. # Use a double horizon because the horizon is shared between Alice and Bob. env_alice = AliceEnv(env_alice=env, env_bob=env, policy_bob=policy, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'], alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold']) policy_alice = GaussianMLPPolicy( env_spec=env_alice.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! 
output_gain=v['output_gain_alice'], init_std=v['policy_init_std_alice'], ) baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec) algo_alice = TRPO( env=env_alice, policy=policy_alice, baseline=baseline_alice, batch_size=v['pg_batch_size_alice'], max_path_length=v['alice_horizon'], n_itr=v['inner_iters_alice'], step_size=0.01, discount=v['discount_alice'], plot=False, ) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") starts, t_alices = generate_starts_alice( env_alice=env_alice, algo_alice=algo_alice, start_states=[v['start_goal']], num_new_starts=v['num_new_starts'], log_dir=log_dir) labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.save() if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=v['step_size'], discount=v['discount'], plot=False, ) # We don't use these labels anyway, so we might as well take them from training. #trpo_paths = algo.train() algo.train() # logger.log("labeling starts with trpo rollouts") # [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj # as_goal=False, env=env) # paths = [path for paths in trpo_paths for path in paths] with logger.tabular_prefix('Outer_'): logger.record_tabular('t_alices', np.mean(t_alices)) logger.log('Generating the Heatmap...') plot_policy_means(policy, env, sampling_res=sampling_res, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) logger.log("Labeling the starts") labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.dump_tabular(with_prefix=False) report.new_row() # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] if len( filtered_raw_starts ) == 0: # add a tone of noise if all the states I had ended up being high_reward! logger.log("Bad Alice! 
All goals are high reward!") # seed_starts = filtered_raw_starts # else: # seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], # variance=v['brownian_variance'] * 10) all_starts.append(filtered_raw_starts)
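Across these loops, each outer iteration trains on freshly generated starts stacked with a sample of old starts from the replay buffer (all_starts.sample followed by np.vstack), so earlier parts of the curriculum keep being revisited. The small helper below sketches that mixing step with assumed names; the real replay buffer is a StateCollection rather than a raw array.

import numpy as np

def mix_with_replay(new_starts, replay_states, num_old_starts, rng=None):
    """Stack fresh starts with num_old_starts states sampled from the replay pool."""
    rng = np.random.default_rng() if rng is None else rng
    new_starts = np.atleast_2d(np.asarray(new_starts, dtype=float))
    replay_states = np.atleast_2d(np.asarray(replay_states, dtype=float))
    if len(replay_states) == 0:
        return new_starts
    idx = rng.integers(0, len(replay_states), size=num_old_starts)
    return np.vstack([new_starts, replay_states[idx]])

batch = mix_with_replay(np.random.uniform(-1, 1, (200, 2)),
                        np.random.uniform(-1, 1, (1000, 2)),
                        num_old_starts=100)
print(batch.shape)  # (300, 2)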