def run_trpo(vv):
    setup_rllab_logging(vv)
    path_len = vv['path_len']
    env = get_env(vv)
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(300, 200, 100),
        init_std=20.0,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=100 * path_len,
        max_path_length=path_len,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
def run_task(*_):
    env = normalize(Arm3dKeyEnv())
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=500,
        n_itr=1000,
        discount=0.99,
        step_size=0.01,
        # Enables live plotting; pair with plot=True in run_experiment_lite.
        plot=True,
    )
    algo.train()
def run_task(*_):
    # env = normalize(SwimmerWrapperGym('Swimmer-v1'))
    env = normalize(GymEnv('Swimmer-v1'))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
        learn_std=True)
    print('horizon {}'.format(env.horizon))
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=200,
        discount=0.99,
        step_size=0.01,
    )
    algo.train()
def run_experiment(**params):
    base_params = copy.copy(DEFAULTS)
    base_params.update(params)
    params = base_params

    grid_world = SlaveGridWorldEnv("3x3", goal_reward=params["goal_reward"])
    env = normalize(grid_world)
    baseline = LinearFeatureBaseline(env)
    policy = CategoricalMLPPolicy(
        name="policy",
        env_spec=env.spec,
        hidden_sizes=params["policy_hidden_dims"],
    )
    optimizer = ConjugateGradientOptimizer(
        hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=params["batch_size"],
        max_path_length=5,
        n_itr=params["n_itr"],
        discount=0.99,
        step_size=params["step_size"],
        optimizer=optimizer,
    )

    run_experiment_lite(
        algo.train(),
        n_parallel=5,
        snapshot_mode="last",
        exp_prefix="grid_world_silent",
        variant=params,
    )
def main(args):
    logger.set_snapshot_dir(args.snapshot_dir)
    logger.set_snapshot_mode("none")
    logger.add_tabular_output(os.path.join(args.snapshot_dir, "tabular.csv"))

    env = GymEnv(args.env_id)

    # If the user provided a starting policy, use it. Otherwise, we start with
    # a fresh policy.
    if args.input_policy is not None:
        with open(args.input_policy, "rb") as f:
            policy = pickle.load(f)
    else:
        policy = CategoricalMLPPolicy(env_spec=env.spec,
                                      hidden_sizes=args.hidden_sizes)
    # policy = CategoricalMLPPolicy(
    #     env_spec=env.spec,
    #     hidden_sizes=(16, 16),
    #     hidden_nonlinearity=lasagne.nonlinearities.rectify)
    # policy = CategoricalGRUPolicy(env_spec=env.spec)

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=args.batch_size,
        max_path_length=env.horizon,
        n_itr=args.n_itr,
        discount=args.discount,
        step_size=args.step_size,
        gae_lambda=args.gae_lambda,
    )
    algo.train()

    with open(args.output_policy, "wb") as f:
        pickle.dump(policy, f)
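A minimal sketch (not part of the original script) of how the policy pickled by main() above could be reloaded and evaluated for one episode; the file path and environment id are placeholders and must match whatever main() was actually trained on.

import pickle

from rllab.envs.gym_env import GymEnv

# Hypothetical path: whatever was passed as args.output_policy above.
with open("output_policy.pkl", "rb") as f:
    policy = pickle.load(f)

env = GymEnv("CartPole-v0")  # placeholder; use the same env_id as training
obs = env.reset()
total_reward = 0.0
for _ in range(env.horizon):
    action, _ = policy.get_action(obs)  # returns (action, distribution info)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break
print("episode return:", total_reward)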
def switch_from_slide(exp_name="Switch_Slide", num=1,
                      directory="./Results/Car/SlideTurn/", save=True):
    rccar = RCCarSlideTurn(noise=0.1)
    env = RLPyEnv(rccar)
    now = datetime.datetime.now()
    timestamp = now.strftime('%Y_%m_%d_%H_%M_%S')

    import joblib
    data = joblib.load("Results/Car/Slide/Base/params.pkl")  # LOAD POLICY
    policy = data['policy']

    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.9,
        step_size=0.01,
        # plot=True,
    )
    # algo.train()
    # rollout(env, policy)
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        exp_name=exp_name + timestamp,
        log_dir=os.path.join(directory, exp_name) if save else './Results/Tmp',
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        seed=1,
        # plot=True,
    )
def run_task(*_):
    env = normalize(GymEnv(models[k]))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    learn_std = True
    init_std = 1
    hidden_sizes = NN_sizes[i]
    # hidden_sizes = (8,)
    # hidden_sizes = (32, 32)
    # hidden_sizes = (100, 50, 25)
    policy = GaussianMLPPolicy(env_spec=env.spec,
                               hidden_sizes=hidden_sizes,
                               learn_std=learn_std,
                               init_std=init_std)

    # =======================
    # Defining the algorithm
    # =======================
    batch_size = 5000
    n_itr = 200
    gamma = .99
    step_size = 0.01
    # max_path_length = 96,

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=batch_size,
        # max_path_length=max_path_length,
        n_itr=n_itr,
        discount=gamma,
        step_size=step_size)
    algo.train()
def run_task(*_):
    env = normalize(GymEnv("Pendulum-v0", record_video=False))
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 8 hidden units.
        hidden_sizes=(8, 8)
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()
def train_domain(domain):
    rc = domain()
    env = RLPyEnv(rc)
    # env = ControllerEnv(k=10)
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=(16, 16),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=100,
        discount=0.995,
        step_size=0.01,
        plot=False,
    )
    run_experiment_lite(
        algo.train(),
        # Number of parallel workers for sampling
        n_parallel=4,
        # Only keep the snapshot parameters for the last iteration
        snapshot_mode="last",
        script="scripts/run_experiment_lite_rl.py",
        # script="scripts/run_experiment_lite.py",
        log_dir="models/rc_gradient/" + domain.proxy_class.__name__,
        # Specifies the seed for the experiment. If this is not provided, a random seed
        # will be used
        # plot=True,
    )
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/michael/" report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntMazeEnv()) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) #baseline = LinearFeatureBaseline(env_spec=env.spec) if v["baseline"] == "MLP": baseline = GaussianMLPBaseline(env_spec=env.spec) else: baseline = LinearFeatureBaseline(env_spec=env.spec) # load the state collection from data_upload all_starts = StateCollection(distance_threshold=v['coll_eps'], states_transform=lambda x: x[:, :2]) # can also filter these starts optionally load_dir = 'sandbox/young_clgan/experiments/starts/maze/maze_ant/' all_feasible_starts = pickle.load( open(osp.join(config.PROJECT_PATH, load_dir, 'good_all_feasible_starts.pkl'), 'rb')) logger.log("We have %d feasible starts" % all_feasible_starts.size) min_reward = 0.1 max_reward = 0.9 improvement_threshold = 0 old_rewards = None # hardest to easiest init_pos = [[0, 0], [1, 0], [2, 0], [3, 0], [4, 0], [4, 1], [4, 2], [4, 3], [4, 4], [3, 4], [2, 4], [1, 4] ][::-1] for pos in init_pos: pos.extend([0.55, 1, 0, 0, 0, 0, 1, 0, -1, 0, -1, 0, 1, ]) array_init_pos = np.array(init_pos) init_pos = [tuple(pos) for pos in init_pos] online_start_generator = Online_TCSL(init_pos) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") report.save() # generate starts from the previous seed starts, which are defined below dist = online_start_generator.get_distribution() # added logger.log(np.array_str(online_start_generator.get_q())) # how to log Q values? # with logger.tabular_prefix("General: "): # logger.record_tabular("Q values:", online_start_generator.get_q()) logger.log(np.array_str(dist)) # Following code should be indented with ExperimentLogger(log_dir, outer_iter // 50, snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") #TODO: might be faster to sample if we just create a roughly representative UniformListStateGenerator? 
env.update_start_generator( ListStateGenerator( init_pos, dist ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() logger.log("Labeling the starts") [starts, labels, mean_rewards, updated] = label_states_from_paths(trpo_paths, n_traj=v['n_traj'], key='goal_reached', # using the min n_traj as_goal=False, env=env, return_mean_rewards=True, order_of_states=init_pos) start_classes, text_labels = convert_label(labels) plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) online_start_generator.update_q(np.array(mean_rewards), np.array(updated)) # added labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: # add a ton of noise if all the states I had ended up being high_reward! logger.log("We have {} good starts!".format(len(filtered_raw_starts))) seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum(start_classes == 1): # if more low reward than high reward logger.log("More bad starts than good starts, sampling seeds from replay buffer") seed_starts = all_starts.sample(300) # sample them from the replay else: logger.log("More good starts than bad starts, resampling") seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=10000, variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts else: raise Exception all_starts.append(filtered_raw_starts) # need to put this last! 
otherwise labels variable gets confused logger.log("Labeling on uniform starts") with logger.tabular_prefix("Uniform_"): unif_starts = all_feasible_starts.sample(100) mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached', as_goals=False, full_path=True) env.log_diagnostics(paths) mean_rewards = mean_reward.reshape(-1, 1) labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward, improvement_threshold=improvement_threshold) logger.log("Starts labelled") plot_labeled_states(unif_starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') # report.add_text("Success: " + str(np.mean(mean_reward))) with logger.tabular_prefix("Fixed_"): mean_reward, paths = evaluate_states(array_init_pos, env, policy, v['horizon'], n_traj=5, key='goal_reached', as_goals=False, full_path=True) env.log_diagnostics(paths) mean_rewards = mean_reward.reshape(-1, 1) labels = compute_labels(mean_rewards, old_rewards=old_rewards, min_reward=min_reward, max_reward=max_reward, improvement_threshold=improvement_threshold) logger.log("Starts labelled") plot_labeled_states(array_init_pos, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.add_text("Fixed Success: " + str(np.mean(mean_reward))) report.new_row() report.save() logger.record_tabular("Fixed test set_success: ", np.mean(mean_reward)) logger.dump_tabular()
nonED = True
if nonED:
    # logger.add_tabular_output('./NonED.log')
    env = NonEDFirestorm_SingleAgent_Env()
    discount = env.discount
    env = normalize(env)
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        discount=discount,
        n_itr=75
    )
else:
    logger.add_tabular_output('./ED.log')
    env = normalize(EDFirestorm_SingleAgent_Env())
    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        random=RANDOM,
        observable_noise=False,
        zero_gradient_cutoff=org_env_size,  # zero out gradients except for config params
        use_max_norm=MAX_NORM,
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=5000,
        max_path_length=env.horizon,
        n_itr=NUM_ITERS,
        discount=0.995,
        step_size=0.01,
        gae_lambda=0.97,
        sampler_args={'n_workers': 2},
        plot_learning_curve=GENERATE_PLOTS,
        trial=trial,
    )
    avg_rewards, std_rewards = algo.train()
    print('trial {}'.format(trial))
    saveModel(
        algo.policy,
        'policy_{}_{}_{}_{}_{}_{}_{}_{}'.format(ENV_NAME, TRAIN_ADVERSARIAL,
                                                NUM_ITERS, PROBABILITY, EPS,
                                                MAX_NORM, USE_DYNAMICS, trial))
def train(env_ind, config_num, num_agents):
    # get the original state space size first
    org_env = GymEnv(original_environments[env_ind])
    org_env_size = org_env.observation_space.shape[0]
    org_env.terminate()

    # the environment
    env = GymEnv(dynamic_environments[env_ind])

    # the configuration settings
    curriculum_config = curriculum_configs[config_num]
    if args.env_ind == 0:
        # batch size for Inverted Pendulum
        curriculum_config.set_batch_size(5000)
    else:
        # batch size for all other environments
        curriculum_config.set_batch_size(25000)

    # the nominal config
    config = curriculum_config.curriculum_list[0]

    for agent_num in range(num_agents):
        # define policy by reading from config class
        policy = CurriculumPolicy(
            env_spec=env.spec,
            hidden_sizes=config.hidden_sizes,
            adaptive_std=config.adaptive_std,
            adversarial=config.adversarial,
            eps=config.eps,
            probability=config.probability,
            use_dynamics=config.use_dynamics,
            random=config.random,
            observable_noise=config.observable_noise,
            zero_gradient_cutoff=org_env_size,
            use_max_norm=config.use_max_norm,
            curriculum_list=list(curriculum_config.curriculum_list),
            update_freq=curriculum_config.update_freq,
        )
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=config.batch_size,
            max_path_length=env.horizon,
            n_itr=config.num_iter,
            discount=config.discount,
            step_size=config.step_size,
            gae_lambda=config.gae_lambda,
            num_workers=config.num_workers,
            plot_learning_curve=config.plot_learning_curve,
            trial=agent_num,
        )
        avg_rewards, std_rewards = algo.train()
        print("training completed!")
        saveModel(
            algo.policy,
            'policy_{}_config_{}_agent_{}'.format(
                dynamic_environments[env_ind], config_num, agent_num))

        # save rewards per model over the iterations
        if config.plot_learning_curve:
            saveModel([range(config.num_iter), avg_rewards, std_rewards],
                      'rewards_{}_config_{}_agent_{}'.format(
                          dynamic_environments[env_ind], config_num, agent_num))
env = GymEnv("ClothEnv-v0")
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 8 hidden units.
    hidden_sizes=(8, 8))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
algo.train()

# run_experiment_lite(
#     algo.train(),
#     # Number of parallel workers for sampling
#     n_parallel=1,
#     # Only keep the snapshot parameters for the last iteration
#     snapshot_mode="last",
#     # Specifies the seed for the experiment. If this is not provided, a random seed
#     # will be used
#     seed=1,
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res'] samples_per_cell = 10 # for the oracle rejection sampling # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=5) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'])) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) uniform_start_generator = UniformStateGenerator(state_size=v['start_size'], bounds=v['start_range'], center=v['start_center']) env = GoalStartExplorationEnv( env=inner_env, start_generator=uniform_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[:v['goal_size']], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) if v['constant_baseline']: logger.log("Using constant baseline") baseline = ConstantBaseline(env_spec=env.spec, value=1.0) else: logger.log("Using linear baseline") baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) # use goal for plot report.new_row() all_starts = StateCollection(distance_threshold=v['coll_eps']) seed_starts = generate_starts(env, starts=[v['ultimate_goal']], subsample=v['num_new_starts']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], horizon=v['brownian_horizon'], variance=v['brownian_variance']) labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.save() if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], 
max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=False, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) logger.log("Labeling the starts") #labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) start_classes, text_labels = convert_label(labels) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.dump_tabular(with_prefix=False) report.new_row() # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [start for start, label in zip(starts, labels) if label[0] == 1] all_starts.append(filtered_raw_starts) if v['seed_with'] == 'only_goods': if len(filtered_raw_starts) > 0: # add a tone of noise if all the states I had ended up being high_reward! seed_starts = filtered_raw_starts elif np.sum(start_classes == 0) > np.sum(start_classes == 1): # if more low reward than high reward seed_starts = all_starts.sample(300) # sample them from the replay else: seed_starts = generate_starts(env, starts=starts, horizon=int(v['horizon'] * 10), subsample=v['num_new_starts'], variance=v['brownian_variance'] * 10) elif v['seed_with'] == 'all_previous': seed_starts = starts elif v['seed_with'] == 'on_policy': seed_starts = generate_starts(env, policy, starts=starts, horizon=v['horizon'], subsample=v['num_new_starts'])
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) report.save() inner_env = normalize(Arm3dDiscEnv()) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-1 * v['goal_size']:], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # load the state collection from data_upload load_dir = 'data_upload/state_collections/' all_feasible_starts = pickle.load( open( osp.join(config.PROJECT_PATH, load_dir, 'disc_all_feasible_states_min.pkl'), 'rb')) print("we have %d feasible starts" % all_feasible_starts.size) all_starts = StateCollection(distance_threshold=v['coll_eps']) # brownian_starts = StateCollection(distance_threshold=v['regularize_starts']) # with env.set_kill_outside(): # seed_starts = generate_starts(env, starts=[v['start_goal']], horizon=10, # this is smaller as they are seeds! # variance=v['brownian_variance'], subsample=v['num_new_starts']) # , animated=True, speedup=1) # # with env.set_kill_outside(): # find_all_feasible_states(env, seed_starts, distance_threshold=0.1, brownian_variance=1, animate=False) # show where these states are: # shuffled_starts = np.array(all_feasible_starts.state_list) # np.random.shuffle(shuffled_starts) # generate_starts(env, starts=shuffled_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10) # Use asymmetric self-play to run Alice to generate starts for Bob. env_alice = AliceEnv(env, env, policy, v['horizon']) policy_alice = GaussianMLPPolicy( env_spec=env_alice.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! 
output_gain=v['output_gain_alice'], init_std=v['policy_init_std_alice'], ) baseline_alice = LinearFeatureBaseline(env_spec=env_alice.spec) algo_alice = TRPO( env=env_alice, policy=policy_alice, baseline=baseline_alice, batch_size=v['pg_batch_size_alice'], max_path_length=v['horizon'], n_itr=v['inner_iters_alice'], step_size=0.01, discount=v['discount_alice'], plot=False, ) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") # with env.set_kill_outside(): # starts = generate_starts(env, starts=seed_starts, horizon=v['brownian_horizon'], variance=v['brownian_variance']) # regularization of the brownian starts # brownian_starts.empty() # brownian_starts.append(starts) # starts = brownian_starts.sample(size=v['num_new_starts']) starts = generate_starts_alice(env_bob=env, env_alice=env_alice, policy_bob=policy, policy_alice=policy_alice, algo_alice=algo_alice, start_states=[v['start_goal']], num_new_starts=v['num_new_starts'], alice_factor=v['alice_factor'], log_dir=log_dir) if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, discount=v['discount'], plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [starts, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=False, env=env) paths = [path for paths in trpo_paths for path in paths] else: logger.log("labeling starts manually") labels, paths = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.record_tabular('starts', starts.size) start_classes, text_labels = convert_label(labels) total_starts = labels.shape[0] logger.record_tabular('GenStarts_evaluated', total_starts) start_class_frac = OrderedDict( ) # this needs to be an ordered dict!! (for the log tabular) for k in text_labels.keys(): frac = np.sum(start_classes == k) / total_starts logger.record_tabular('GenStart_frac_' + text_labels[k], frac) start_class_frac[text_labels[k]] = frac labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.log("Labeling on uniform starts") with logger.tabular_prefix("Uniform_"): unif_starts = all_feasible_starts.sample(1000) mean_reward, paths = evaluate_states(unif_starts, env, policy, v['horizon'], n_traj=1, key='goal_reached', as_goals=False, full_path=True) env.log_diagnostics(paths) logger.dump_tabular(with_prefix=True) # append new states to list of all starts (replay buffer): Not the low reward ones!! logger.log("Appending good goals to replay and generating seeds") filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] all_starts.append(filtered_raw_starts)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) tf_session = tf.Session() inner_env = normalize(AntMazeEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], only_feasible=v['only_feasible'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) # GAN logger.log("Instantiating the GAN...") gan_configs = {key[4:]: value for key, value in v.items() if 'GAN_' in key} for key, value in gan_configs.items(): if value is tf.train.AdamOptimizer: gan_configs[key] = tf.train.AdamOptimizer(gan_configs[key + '_stepSize']) if value is tflearn.initializations.truncated_normal: gan_configs[key] = tflearn.initializations.truncated_normal(stddev=gan_configs[key + '_stddev']) gan = StateGAN( state_size=v['goal_size'], evaluater_size=v['num_labels'], state_range=v['goal_range'], state_center=v['goal_center'], state_noise_level=v['goal_noise_level'], generator_layers=v['gan_generator_layers'], discriminator_layers=v['gan_discriminator_layers'], noise_size=v['gan_noise_size'], tf_session=tf_session, configs=gan_configs, ) logger.log("pretraining the GAN...") if v['smart_init']: feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) labels = np.ones((feasible_goals.shape[0], 2)).astype(np.float32) # make them all good goals plot_labeled_states(feasible_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) dis_loss, gen_loss = gan.pretrain(states=feasible_goals, outer_iters=v['gan_outer_iters']) print("Loss of Gen and Dis: ", gen_loss, dis_loss) else: gan.pretrain_uniform() # log first samples form the GAN initial_goals, _ = gan.sample_states_with_noise(v['num_new_goals']) logger.log("Labeling the goals") labels = label_states(initial_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(initial_goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) # Sample GAN logger.log("Sampling goals from the GAN") raw_goals, _ = 
gan.sample_states_with_noise(v['num_new_goals']) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample(v['num_old_goals']) goals = np.vstack([raw_goals, old_goals]) else: goals = raw_goals # if needed label the goals before any update if v['label_with_variation']: old_labels, old_rewards = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=False, return_rew=True) # itr_label = outer_iter # use outer_iter to log everything or "last" to log only the last # with ExperimentLogger(log_dir, itr_label, snapshot_mode='last', hold_outter_log=True): with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], ) ) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() if v['use_trpo_paths']: logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths(trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] elif v['label_with_variation']: labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', old_rewards=old_rewards, full_path=True) else: logger.log("labeling starts manually") labels, paths = label_states(goals, env, policy, v['horizon'], as_goals=True, n_traj=v['n_traj'], key='goal_reached', full_path=True) with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center']) #logger.log("Labeling the goals") #labels = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) if v['label_with_variation']: # this will use only the performance variation for labeling labels = np.array(labels[:, -1], dtype=int).reshape((-1, 1)) else: labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.log("Training the GAN") gan.train( goals, labels, v['gan_outer_iters'], ) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! 
filtered_raw_goals = [goal for goal, label in zip(goals, labels) if label[0] == 1] all_goals.append(filtered_raw_goals) if v['add_on_policy']: logger.log("sampling on policy") feasible_goals = generate_initial_goals(env, policy, v['goal_range'], goal_center=v['goal_center'], horizon=v['horizon']) # downsampled_feasible_goals = feasible_goals[np.random.choice(feasible_goals.shape[0], v['add_on_policy']),:] all_goals.append(feasible_goals)
def run_task(*_): """Implement the run_task method needed to run experiments with rllab.""" V_ENTER = 30 INNER_LENGTH = 300 LONG_LENGTH = 100 SHORT_LENGTH = 300 N_ROWS = 3 N_COLUMNS = 3 NUM_CARS_LEFT = 1 NUM_CARS_RIGHT = 1 NUM_CARS_TOP = 1 NUM_CARS_BOT = 1 tot_cars = (NUM_CARS_LEFT + NUM_CARS_RIGHT) * N_COLUMNS \ + (NUM_CARS_BOT + NUM_CARS_TOP) * N_ROWS grid_array = { "short_length": SHORT_LENGTH, "inner_length": INNER_LENGTH, "long_length": LONG_LENGTH, "row_num": N_ROWS, "col_num": N_COLUMNS, "cars_left": NUM_CARS_LEFT, "cars_right": NUM_CARS_RIGHT, "cars_top": NUM_CARS_TOP, "cars_bot": NUM_CARS_BOT } sim_params = SumoParams(sim_step=1, render=True) vehicles = VehicleParams() vehicles.add(veh_id="idm", acceleration_controller=(SimCarFollowingController, {}), car_following_params=SumoCarFollowingParams( min_gap=2.5, tau=1.1, max_speed=V_ENTER, speed_mode="all_checks"), routing_controller=(GridRouter, {}), num_vehicles=tot_cars) tl_logic = TrafficLightParams(baseline=False) additional_env_params = { "target_velocity": 50, "switch_time": 3.0, "num_observed": 2, "discrete": False, "tl_type": "controlled" } env_params = EnvParams(additional_params=additional_env_params) additional_net_params = { "speed_limit": 35, "grid_array": grid_array, "horizontal_lanes": 1, "vertical_lanes": 1 } if USE_INFLOWS: initial_config, net_params = get_flow_params( v_enter=V_ENTER, vehs_per_hour=EDGE_INFLOW, col_num=N_COLUMNS, row_num=N_ROWS, add_net_params=additional_net_params) else: initial_config, net_params = get_non_flow_params( V_ENTER, additional_net_params) scenario = SimpleGridScenario(name="grid-intersection", vehicles=vehicles, net_params=net_params, initial_config=initial_config, traffic_lights=tl_logic) env_name = "PO_TrafficLightGridEnv" pass_params = (env_name, sim_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=40000, max_path_length=horizon, # whole_paths=True, n_itr=800, discount=0.999, # step_size=0.01, ) algo.train()
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    baselinename=par.baseline_name,
    self_normalize=True,
    log_deterministic=True,
    batch_size=batch_size,
    whole_paths=True,
    max_path_length=max_path_length,
    n_itr=par.n_itr,
    discount=0.99,  # not used
    discount_low=0.99,
    discount_high=par.discount_high,  # 0.99 prev, Rui wants to change it to 0.8 (for time_step_agg == 100)
    train_high_every=par.train_high_every,  # 1 prev, Rui wants to change it to 10 (for time_step_agg == 100)
    step_size=0.01,
    train_low=par.train_low,
    train_high=par.train_high,
    train_low_with_penalty=par.train_low_with_penalty,
    train_low_with_v_split=par.train_low_with_v_split,
    train_low_with_v_gradient=par.train_low_with_v_gradient,
    train_low_with_external=par.train_low_with_external,
    time_step_agg_anneal=par.time_step_agg_anneal,
    anneal_base_number=par.anneal_base_number,
    total_low_step=par.low_step_num,
    episode_max_low_step=par.max_low_step,
    low_level_entropy_penalty=par.low_level_entropy_penalty,
    itr_delay=par.itr_delay,
    transfer=par.transfer,
    transfer_high=par.transfer_high,
    warm_path=par.warm_path,
)
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.box2d.cartpole_env import CartpoleEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy
from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
from rllab.misc.instrument import stub, run_experiment_lite

stub(globals())

env = normalize(CartpoleEnv())

policy = GaussianGRUPolicy(env_spec=env.spec)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=10,
            discount=0.99,
            step_size=0.01,
            optimizer=ConjugateGradientOptimizer(
                hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)))

run_experiment_lite(
    algo.train(),
    n_parallel=1,
    seed=1,
)
def train(env, policy, policy_init, num_episodes, episode_cap, horizon, **alg_args):
    # Getting the environment
    env_class = rllab_env_from_name(env)
    env = normalize(env_class())

    # Policy initialization
    if policy_init == 'zeros':
        initializer = LI.Constant(0)
    elif policy_init == 'normal':
        initializer = LI.Normal()
    else:
        raise Exception('Unrecognized policy initialization.')

    # Setting the policy type
    if policy == 'linear':
        hidden_sizes = tuple()
    elif policy == 'simple-nn':
        hidden_sizes = [16]
    else:
        raise Exception('NOT IMPLEMENTED.')

    # Creating the policy
    obs_dim = env.observation_space.flat_dim
    action_dim = env.action_space.flat_dim
    mean_network = MLP(
        input_shape=(obs_dim,),
        output_dim=action_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        output_b_init=None,
        output_W_init=initializer,
    )
    policy = GaussianMLPPolicy(
        env_spec=env.spec,
        # hidden_sizes is chosen above from the requested policy type ('linear' or 'simple-nn').
        hidden_sizes=hidden_sizes,
        mean_network=mean_network,
        log_weights=True,
    )

    # Creating baseline
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Adding max_episodes constraint. If -1, this is unbounded
    if episode_cap:
        alg_args['max_episodes'] = num_episodes

    # Run algorithm
    algo = TRPO(env=env,
                policy=policy,
                baseline=baseline,
                batch_size=horizon * num_episodes,
                whole_paths=True,
                max_path_length=horizon,
                **alg_args)
    algo.train()

    print('----- ENDING ------')
    print(policy.get_param_values())
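Related to the print(policy.get_param_values()) call above, a minimal sketch (my addition, assuming it runs in the same process after algo.train()) of saving and restoring the policy weights as a single flat NumPy vector; the file name is hypothetical.

import numpy as np

theta = policy.get_param_values()             # all trainable parameters, flattened
np.save("policy_params.npy", theta)           # hypothetical file name
policy.set_param_values(np.load("policy_params.npy"))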
cart_env = normalize(CartpoleEnv())
env = GaussianNoiseEnv(cart_env, sigma=0.2)
policy = GaussianMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32))
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(env=env,
            policy=policy,
            baseline=baseline,
            batch_size=4000,
            max_path_length=100,
            n_itr=40,
            discount=0.99,
            step_size=0.01,
            plot=True)

run_experiment_lite(
    algo.train(),
    # Number of parallel workers for sampling
    n_parallel=1,
    # Only keep the snapshot parameters for the last iteration
    snapshot_mode="last",
    # Specifies the seed for the experiment. If this is not provided, a random seed
    # will be used
    seed=1,
    plot=True,
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from custom_env.reacher_mod import ReacherEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy

env = normalize(ReacherEnv())
policy = GaussianMLPPolicy(env_spec=env.spec)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
)
algo.train()

observation = env.reset()
for _ in range(5000):
    env.render()
    action, _ = policy.get_action(observation)
    observation, reward, terminal, _ = env.step(action)
    if terminal:
        break
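A minimal sketch (not in the original snippet) of persisting the trained reacher policy with pickle, mirroring the save/load pattern used by the other scripts in this collection; the path is a placeholder.

import pickle

with open("reacher_policy.pkl", "wb") as f:   # hypothetical path
    pickle.dump(policy, f)

# The policy can later be restored the same way the other scripts do:
with open("reacher_policy.pkl", "rb") as f:
    policy = pickle.load(f)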
baseline = LinearFeatureBaseline(env_spec=env.spec)
# baseline = GaussianMLPBaseline(env_spec=env.spec)

optimizer_args = dict(
    # debug_nan=True,
    # reg_coeff=0.1,
    # cg_iters=2
)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=100,
    n_itr=200,
    discount=0.99,
    step_size=0.01,
    sampler_cls=ISSampler,
    sampler_args=dict(n_backtrack=1),
    optimizer_args=optimizer_args
)

# algo.train()
run_experiment_lite(
    stub_method_call=algo.train(),
    mode="local",
    use_cloudpickle=False,
    pre_commands=['pip install --upgrade pip',
                  'pip install --upgrade theano',
                  ],
    # Number of parallel workers for sampling
def run_task(*_): sumo_params = SumoParams(sim_step=0.2, sumo_binary="sumo-gui") # note that the vehicles are added sequentially by the generator, # so place the merging vehicles after the vehicles in the ring vehicles = Vehicles() # Inner ring vehicles vehicles.add(veh_id="human", acceleration_controller=(IDMController, { "noise": 0.2 }), lane_change_controller=(SumoLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=6, sumo_car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5), sumo_lc_params=SumoLaneChangeParams()) # A single learning agent in the inner ring vehicles.add(veh_id="rl", acceleration_controller=(RLController, {}), lane_change_controller=(SumoLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), speed_mode="no_collide", num_vehicles=1, sumo_car_following_params=SumoCarFollowingParams(minGap=0.01, tau=0.5), sumo_lc_params=SumoLaneChangeParams()) # Outer ring vehicles vehicles.add(veh_id="merge-human", acceleration_controller=(IDMController, { "noise": 0.2 }), lane_change_controller=(SumoLaneChangeController, {}), routing_controller=(ContinuousRouter, {}), num_vehicles=10, sumo_car_following_params=SumoCarFollowingParams(minGap=0.0, tau=0.5), sumo_lc_params=SumoLaneChangeParams()) env_params = EnvParams(horizon=HORIZON, additional_params={ "max_accel": 3, "max_decel": 3, "target_velocity": 10, "n_preceding": 2, "n_following": 2, "n_merging_in": 2, }) additional_net_params = ADDITIONAL_NET_PARAMS.copy() additional_net_params["ring_radius"] = 50 additional_net_params["inner_lanes"] = 1 additional_net_params["outer_lanes"] = 1 additional_net_params["lane_length"] = 75 net_params = NetParams(no_internal_links=False, additional_params=additional_net_params) initial_config = InitialConfig(x0=50, spacing="uniform", additional_params={"merge_bunching": 0}) scenario = TwoLoopsOneMergingScenario( name=exp_tag, generator_class=TwoLoopOneMergingGenerator, vehicles=vehicles, net_params=net_params, initial_config=initial_config) env_name = "TwoLoopsMergePOEnv" pass_params = (env_name, sumo_params, vehicles, env_params, net_params, initial_config, scenario) env = GymEnv(env_name, record_video=False, register_params=pass_params) horizon = env.horizon env = normalize(env) policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(100, 50, 25)) baseline = LinearFeatureBaseline(env_spec=env.spec) algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=64 * 3 * horizon, max_path_length=horizon, # whole_paths=True, n_itr=1000, discount=0.999, # step_size=0.01, ) algo.train()
    hidden_sizes=layer_size,
    is_protagonist=False
)
adv_baseline = LinearFeatureBaseline(env_spec=env.spec)

## Initializing the parallel sampler ##
parallel_sampler.initialize(n_process)

## Optimizer for the Protagonist ##
pro_algo = TRPO(
    env=env,
    pro_policy=pro_policy,
    adv_policy=adv_policy,
    pro_baseline=pro_baseline,
    adv_baseline=adv_baseline,
    batch_size=batch_size,
    max_path_length=path_length,
    n_itr=n_pro_itr,
    discount=0.995,
    gae_lambda=gae_lambda,
    step_size=step_size,
    is_protagonist=True
)

## Optimizer for the Adversary ##
adv_algo = TRPO(
    env=env,
    pro_policy=pro_policy,
    adv_policy=adv_policy,
    pro_baseline=pro_baseline,
    adv_baseline=adv_baseline,
    batch_size=batch_size,
"std_hidden_sizes": (32, 32), "std_nonlinearity": None, "normalize_inputs": True, "normalize_outputs": True, }) algo = TRPO( env=env, policy=policy, baseline=baseline, n_itr=3000, max_path_lenght=2000, experiment_spec=experiment_spec, save_policy_every=save_policy_every, batch_size=30000, discount=0.995, gae_lambda=0.98, step_size=0.01, # baseline #discriminator=None, # GAN imitation discriminator=discriminator, ) algo.train(), pickle.dump(policy, open("model/model1.pickle", "wb")) # except Exception as e: # exc_type, exc_value, exc_traceback = sys.exc_info()
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) sampling_res = 0 if 'sampling_res' not in v.keys() else v['sampling_res'] unif_samples = 300 # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report and plot_policy_reward...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=3) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(AntEnv()) uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'], center=v['goal_center']) env = GoalExplorationEnv( env=inner_env, goal_generator=uniform_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], append_transformed_obs=v['append_transformed_obs'], append_extra_info=v['append_extra_info'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) if v['baseline'] == 'g_mlp': baseline = GaussianMLPBaseline(env_spec=env.spec) # initialize all logging arrays on itr0 outer_iter = 0 logger.log('Generating the Initial Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) report.new_row() all_goals = StateCollection(distance_threshold=v['coll_eps']) total_rollouts = 0 for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling goals") goals = np.array([]).reshape((-1, v['goal_size'])) k = 0 while goals.shape[0] < v['num_new_goals']: print('good goals collected: ', goals.shape[0]) logger.log("Sampling and labeling the goals: %d" % k) k += 1 unif_goals = np.random.uniform( np.array(v['goal_center']) - np.array(v['goal_range']), np.array(v['goal_center']) + np.array(v['goal_range']), size=(unif_samples, v['goal_size'])) labels = label_states(unif_goals, env, policy, v['horizon'], n_traj=v['n_traj'], key='goal_reached') logger.log("Converting the labels") init_classes, text_labels = convert_label(labels) goals = np.concatenate([goals, unif_goals[init_classes == 2]]).reshape( (-1, v['goal_size'])) if v['replay_buffer'] and outer_iter > 0 and all_goals.size > 0: old_goals = all_goals.sample( v['num_old_goals']) #todo: replay noise? 
goals = np.vstack([goals, old_goals]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment goal generator") env.update_goal_generator( UniformListStateGenerator( goals.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], step_size=0.01, plot=False, ) trpo_paths = algo.train() logger.log("labeling starts with trpo rollouts") [goals, labels] = label_states_from_paths( trpo_paths, n_traj=2, key='goal_reached', # using the min n_traj as_goal=True, env=env) paths = [path for paths in trpo_paths for path in paths] with logger.tabular_prefix("OnStarts_"): env.log_diagnostics(paths) logger.log('Generating the Heatmap...') test_and_plot_policy(policy, env, max_reward=v['max_reward'], sampling_res=sampling_res, n_traj=v['n_traj'], itr=outer_iter, report=report, limit=v['goal_range'], center=v['goal_center'], bounds=v['goal_range']) plot_labeled_states(goals, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) # rollouts used for labeling (before TRPO itrs): logger.record_tabular('LabelingRollouts', k * v['n_traj'] * unif_samples) total_rollouts += k * v['n_traj'] * unif_samples logger.record_tabular('TotalLabelingRollouts', total_rollouts) logger.dump_tabular(with_prefix=False) report.new_row() # append new goals to list of all goals (replay buffer): Not the low reward ones!! filtered_raw_goals = [ goal for goal, label in zip(goals, labels) if label[0] == 1 ] all_goals.append(filtered_raw_goals)
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 32 hidden units.
    hidden_sizes=(32, 32),
    init_std=10
)
baseline = LinearFeatureBaseline(env_spec=env.spec)
algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=25000,
    max_path_length=50,
    n_itr=1000,
    discount=0.99,
    step_size=0.01,
    # imsize=(48, 48),
    name='reach',
    mode='oracle',
    exp_prefix="reacher_state",
    # force_batch_sampler=True,
    # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
)

run_experiment_lite(
    algo.train(),
    n_parallel=6,
    seed=1,
)
def run_task(v): random.seed(v['seed']) np.random.seed(v['seed']) # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1] logger.log("Initializing report...") log_dir = logger.get_snapshot_dir() # problem with logger module here!! if log_dir is None: log_dir = "/home/michael/" report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4) report.add_header("{}".format(EXPERIMENT_TYPE)) report.add_text(format_dict(v)) inner_env = normalize(SwimmerMazeEnv()) fixed_goal_generator = FixedStateGenerator(state=v['ultimate_goal']) fixed_start_generator = FixedStateGenerator(state=v['ultimate_goal']) env = GoalStartExplorationEnv( env=inner_env, start_generator=fixed_start_generator, obs2start_transform=lambda x: x[:v['start_size']], goal_generator=fixed_goal_generator, obs2goal_transform=lambda x: x[-3:-1], terminal_eps=v['terminal_eps'], distance_metric=v['distance_metric'], extend_dist_rew=v['extend_dist_rew'], inner_weight=v['inner_weight'], goal_weight=v['goal_weight'], terminate_env=True, ) policy = GaussianMLPPolicy( env_spec=env.spec, hidden_sizes=(64, 64), # Fix the variance since different goals will require different variances, making this parameter hard to learn. learn_std=v['learn_std'], adaptive_std=v['adaptive_std'], std_hidden_sizes=(16, 16), # this is only used if adaptive_std is true! output_gain=v['output_gain'], init_std=v['policy_init_std'], ) baseline = LinearFeatureBaseline(env_spec=env.spec) # load the state collection from data_upload load_dir = 'data_upload/state_collections/' # all_feasible_starts = pickle.load(open(osp.join(config.PROJECT_PATH, load_dir, 'all_feasible_states_min.pkl'), 'rb')) # print("we have %d feasible starts" % all_feasible_starts.size) all_starts = StateCollection(distance_threshold=v['coll_eps']) # brownian_starts = StateCollection(distance_threshold=v['regularize_starts']) # with env.set_kill_outside(): seed_starts = generate_starts( env, starts=[v['start_goal']], horizon=v['initial_brownian_horizon'], size=5000, # size speeds up training a bit variance=v['brownian_variance'], subsample=v['num_new_starts']) # , animated=True, speedup=1) np.random.shuffle(seed_starts) # with env.set_kill_outside(): feasible_states = find_all_feasible_states_plotting(env, seed_starts, distance_threshold=1, brownian_variance=1, animate=True) # print("hi") # show where these states are: # shuffled_starts = np.array(seed_starts.state_list) # np.random.shuffle(shuffled_starts) # generate_starts(env, starts=seed_starts, horizon=100, variance=v['brownian_variance'], animated=True, speedup=10) if 'gae_lambda' not in v: v['gae_lambda'] = 1 for outer_iter in range(1, v['outer_iters']): logger.log("Outer itr # %i" % outer_iter) logger.log("Sampling starts") starts = generate_starts(env, starts=seed_starts, subsample=v['num_new_starts'], size=5000, horizon=v['brownian_horizon'], variance=v['brownian_variance']) labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id'], summary_string_base='initial starts labels:\n') report.save() if v['replay_buffer'] and outer_iter > 0 and all_starts.size > 0: old_starts = all_starts.sample(v['num_old_starts']) starts = np.vstack([starts, old_starts]) with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True): logger.log("Updating the environment start generator") env.update_start_generator( 
UniformListStateGenerator( starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'], )) logger.log("Training the algorithm") algo = TRPO( env=env, policy=policy, baseline=baseline, batch_size=v['pg_batch_size'], max_path_length=v['horizon'], n_itr=v['inner_iters'], gae_lambda=v['gae_lambda'], step_size=0.01, discount=v['discount'], plot=False, ) algo.train() logger.log('Generating the Heatmap...') # policy means should not mean too much # plot_policy_means(policy, env, sampling_res=2, report=report, limit=v['goal_range'], center=v['goal_center']) test_and_plot_policy(policy, env, as_goals=False, max_reward=v['max_reward'], sampling_res=1, n_traj=v['n_traj'], itr=outer_iter, report=report, center=v['goal_center'], limit=v['goal_range']) logger.log("Labeling the starts") labels = label_states(starts, env, policy, v['horizon'], as_goals=False, n_traj=v['n_traj'], key='goal_reached') plot_labeled_states(starts, labels, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center'], maze_id=v['maze_id']) # ###### extra for deterministic: # logger.log("Labeling the goals deterministic") # with policy.set_std_to_0(): # labels_det = label_states(goals, env, policy, v['horizon'], n_traj=v['n_traj'], n_processes=1) # plot_labeled_states(goals, labels_det, report=report, itr=outer_iter, limit=v['goal_range'], center=v['goal_center']) labels = np.logical_and(labels[:, 0], labels[:, 1]).astype(int).reshape((-1, 1)) logger.dump_tabular(with_prefix=False) report.new_row() # append new states to list of all starts (replay buffer): Not the low reward ones!! filtered_raw_starts = [ start for start, label in zip(starts, labels) if label[0] == 1 ] if len( filtered_raw_starts ) > 0: # add a tone of noise if all the states I had ended up being high_reward! seed_starts = filtered_raw_starts else: seed_starts = generate_starts(env, starts=starts, horizon=v['horizon'] * 2, subsample=v['num_new_starts'], size=5000, variance=v['brownian_variance'] * 10) all_starts.append(filtered_raw_starts)