import random
import os.path as osp

import numpy as np

from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.normalized_env import normalize
from rllab.misc import logger
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

# Project-local helpers (HTMLReport, format_dict, PointMazeEnv, the state
# generators, GoalStartExplorationEnv, AliceFakeEnv, generate_starts_alice,
# plot_labeled_states, EXPERIMENT_TYPE) are assumed to be imported from the
# surrounding package.


def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = v.get('sampling_res', 2)
    samples_per_cell = 10  # for the oracle rejection sampling

    # Log performance of the randomly initialized policy with the FIXED goal [0.1, 0.1].
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # get_snapshot_dir() can return None if the logger is not configured
    if log_dir is None:
        log_dir = "/home/davheld/repos/rllab_goal_rl/data/local/debug"
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], length=v['maze_length']))

    fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal'])
    uniform_start_generator = UniformStateGenerator(
        state_size=v['start_size'], bounds=v['start_range'], center=v['start_center'])

    env = GoalStartExplorationEnv(
        env=inner_env,
        start_generator=uniform_start_generator,
        obs2start_transform=lambda x: x[:v['start_size']],
        goal_generator=fixed_goal_generator,
        obs2goal_transform=lambda x: x[:v['goal_size']],
        terminal_eps=v['terminal_eps'],
        distance_metric=v['distance_metric'],
        extend_dist_rew=v['extend_dist_rew'],
        only_feasible=v['only_feasible'],
        terminate_env=True,
    )

    # Initialize all logging arrays on itr 0.
    outer_iter = 0

    # TODO: show initial states for Alice.
    report.new_row()

    ring_spacing = 1
    init_iter = 2

    # Use asymmetric self-play to have Alice generate starts for Bob.
    # Use a double horizon because the horizon is shared between Alice and Bob.
    env_alice = AliceFakeEnv(
        env, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'],
        alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold'],
        ring_spacing=ring_spacing, init_iter=init_iter)

    policy_alice = GaussianMLPPolicy(
        env_spec=env_alice.spec,
        hidden_sizes=(64, 64),
        # Fix the variance, since different goals require different variances,
        # which makes this parameter hard to learn.
        learn_std=v['learn_std'],
        adaptive_std=v['adaptive_std'],
        std_hidden_sizes=(16, 16),  # only used if adaptive_std is True
        output_gain=v['output_gain_alice'],
        init_std=v['policy_init_std_alice'],
    )

    baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec)

    algo_alice = TRPO(
        env=env_alice,
        policy=policy_alice,
        baseline=baseline_alice,
        batch_size=v['pg_batch_size_alice'],
        max_path_length=v['alice_horizon'],
        n_itr=v['inner_iters_alice'],
        step_size=0.01,
        discount=v['discount_alice'],
        plot=False,
    )

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)
        logger.log("Sampling starts")
        logger.log("Init iter: %d" % init_iter)

        # Rebuild Alice's environment each outer iteration and point the
        # algorithm at the fresh instance.
        env_alice = AliceFakeEnv(
            env, max_path_length=v['alice_horizon'], alice_factor=v['alice_factor'],
            alice_bonus=v['alice_bonus'], gamma=1, stop_threshold=v['stop_threshold'],
            ring_spacing=ring_spacing, init_iter=init_iter)
        algo_alice.env = env_alice

        starts, t_alices = generate_starts_alice(
            env_alice=env_alice, algo_alice=algo_alice, start_states=[v['start_goal']],
            num_new_starts=v['num_new_starts'], log_dir=log_dir)

        # Make fake labels (all ones) just for plotting.
        labels = np.ones([len(starts), 2])
        radius = init_iter * ring_spacing
        plot_labeled_states(
            starts, labels, report=report, itr=outer_iter, limit=v['goal_range'],
            center=v['goal_center'], maze_id=v['maze_id'],
            summary_string_base='initial starts labels:\n', radius=radius)
        report.save()

        with logger.tabular_prefix('Outer_'):
            logger.record_tabular('t_alices', np.mean(t_alices))
        logger.dump_tabular(with_prefix=False)
        report.new_row()
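

# Usage sketch (an assumption, not part of this file): run_task has the
# standard rllab entry-point signature, so it would typically be launched
# through rllab's run_experiment_lite with a variant dict supplying the
# v[...] keys read above. The variant values below are placeholders, not
# values used by the original experiment.
#
# from rllab.misc.instrument import run_experiment_lite
#
# run_experiment_lite(
#     run_task,
#     variant=dict(seed=0, maze_id=0),  # ...plus the remaining v[...] keys
#     n_parallel=1,
#     snapshot_mode='last',
#     seed=0,
# )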