def prepare_env(env_name, visionmodel_path, **env_kwargs):
    from gym.spaces import Box
    if env_name.find('doorenv') > -1:
        expl_env = NormalizedBoxEnv(gym.make(env_name, **env_kwargs))
        xml_path = expl_env._wrapped_env.xml_path
        if env_kwargs['visionnet_input']:
            print("using vision")
            eval_env = None
            if env_kwargs['unity']:
                expl_env._wrapped_env.init()
        else:
            print("no vision")
            eval_env = NormalizedBoxEnv(gym.make(env_name, **env_kwargs))
        env_obj = expl_env._wrapped_env
        # Resize the observation space to the env's nn*2+3-dimensional observation.
        expl_env.observation_space = Box(np.zeros(env_obj.nn * 2 + 3),
                                         np.zeros(env_obj.nn * 2 + 3),
                                         dtype=np.float32)
        if eval_env:
            eval_env.observation_space = Box(np.zeros(env_obj.nn * 2 + 3),
                                             np.zeros(env_obj.nn * 2 + 3),
                                             dtype=np.float32)
    elif env_name.find("Fetch") > -1:
        env = gym.make(env_name, reward_type='sparse')
        env = wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])
        expl_env = NormalizedBoxEnv(env)
        eval_env = NormalizedBoxEnv(env)
        env_obj = None
    else:
        expl_env = NormalizedBoxEnv(gym.make(env_name))
        eval_env = NormalizedBoxEnv(gym.make(env_name))
        env_obj = None
    return expl_env, eval_env, env_obj
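
# Usage sketch (illustrative, not from the original project): exercising the Fetch
# branch of prepare_env above. 'FetchReach-v1' and the None vision-model path are
# placeholder choices; the doorenv branch would additionally need the
# visionnet_input / unity kwargs.
expl_env, eval_env, env_obj = prepare_env('FetchReach-v1', visionmodel_path=None)
print(expl_env.observation_space, expl_env.action_space)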
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = gym.make('FetchPush-v1')
        env = TfEnv(wp.FlattenDictWrapper(
            env, dict_keys=["observation", "desired_goal"]))
        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)
        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
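
# Hedged launch sketch: assuming the garage release this snippet targets (the
# LocalTFRunner era) exposes run_experiment with this signature, run_task would
# typically be started like this. Adjust to the exact garage version in use.
from garage.experiment import run_experiment

run_experiment(
    run_task,
    snapshot_mode='last',  # keep only the most recent snapshot
    seed=1,
)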
env = gym.make(ENV_NAME)

# 90/10 train/validation split of the expert demonstrations.
train_length = int(0.9 * len(observations))
print(train_length)
train_obs = observations[:train_length, :]
train_acts = actions[:train_length, :]
valid_obs = observations[train_length:, :]
valid_acts = actions[train_length:, :]
print(train_obs.shape)
print(valid_obs.shape)

# ENV_NAME = 'Pendulum-v0'
env = wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])
act_dim = env.action_space.shape[0]
act_limit = env.action_space.high[0]

# Logging.
start_time = time.time()
train_log_dir = 'logs/' + 'BC:' + str(start_time)
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

policy = mlp_gaussian_policy(act_dim=actions.shape[1], act_limit=act_limit,
                             hidden_sizes=[128, 128])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
max_ep_len = 200


# Behavioural-cloning training step.
@tf.function
def train_step(obs, expert_act):
    with tf.GradientTape() as tape:
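
# The tape body of train_step is cut off in the source. Below is a minimal,
# self-contained sketch of what a behavioural-cloning step could look like,
# assuming a policy network that maps observations to mean actions. policy_net
# and bc_optimizer are illustrative stand-ins, not the original
# mlp_gaussian_policy / optimizer objects.
import tensorflow as tf

policy_net = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(4),  # Fetch tasks use a 4-dimensional action space
])
bc_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)


@tf.function
def bc_train_step(obs, expert_act):
    # Plain MSE behavioural cloning against the expert actions.
    with tf.GradientTape() as tape:
        pred_act = policy_net(obs)
        loss = tf.reduce_mean(tf.square(pred_act - expert_act))
    grads = tape.gradient(loss, policy_net.trainable_variables)
    bc_optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))
    return loss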
def SAC(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=2000, epochs=100,
        replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2,
        batch_size=100, start_steps=2000, max_ep_len=1000, save_freq=1,
        load=False, exp_name="Experiment_1", render=False):

    tf.random.set_seed(seed)
    np.random.seed(seed)
    env, test_env = env_fn(), env_fn()
    test_env.render(mode='human')
    test_env.reset()
    env = wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'desired_goal'])
    test_env = wrappers.FlattenDictWrapper(
        test_env, dict_keys=['observation', 'desired_goal'])

    # Get env dimensions.
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    SAC = SAC_model(act_limit, obs_dim, act_dim, ac_kwargs['hidden_sizes'],
                    lr, gamma, alpha, polyak, load, exp_name)

    # Experience buffer.
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Logging.
    start_time = time.time()
    train_log_dir = 'logs/' + exp_name + str(int(start_time))
    summary_writer = SummaryWriter(train_log_dir)

    def update_models(model, replay_buffer, steps, batch_size):
        for j in range(steps):
            batch = replay_buffer.sample_batch(batch_size)
            LossPi, LossQ1, LossQ2, LossV, Q1Vals, Q2Vals, VVals, LogPi = model.update(batch)

    # Now collect episodes.
    total_steps = steps_per_epoch * epochs
    steps_collected = 0

    if not load:
        # Collect some initial random steps to initialise the replay buffer.
        steps_collected += rollout_trajectories(n_steps=start_steps,
                                                env=env,
                                                max_ep_len=max_ep_len,
                                                actor='random',
                                                replay_buffer=replay_buffer,
                                                summary_writer=summary_writer,
                                                exp_name=exp_name)
        update_models(SAC, replay_buffer, steps=steps_collected, batch_size=batch_size)

    # Now act with our actor: alternately collect data, then train.
    while steps_collected < total_steps:
        # Collect an episode.
        steps_collected += rollout_trajectories(n_steps=max_ep_len,
                                                env=env,
                                                max_ep_len=max_ep_len,
                                                actor=SAC.actor.get_stochastic_action,
                                                replay_buffer=replay_buffer,
                                                summary_writer=summary_writer,
                                                current_total_steps=steps_collected,
                                                exp_name=exp_name)
        # Then take that many training steps.
        update_models(SAC, replay_buffer, steps=max_ep_len, batch_size=batch_size)

        # If an epoch has elapsed, save and test.
        if steps_collected > 0 and steps_collected % steps_per_epoch == 0:
            SAC.save_weights()
            # Test the performance of the deterministic version of the agent.
            rollout_trajectories(n_steps=max_ep_len * 10,
                                 env=test_env,
                                 max_ep_len=max_ep_len,
                                 actor=SAC.actor.get_deterministic_action,
                                 summary_writer=summary_writer,
                                 current_total_steps=steps_collected,
                                 train=False,
                                 render=True,
                                 exp_name=exp_name)
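
# Hedged usage sketch: launching the SAC loop above on a goal-based Fetch task.
# Assumes an older gym where 'FetchReach-v1' and wrappers.FlattenDictWrapper
# exist; the hyperparameter values are illustrative, not the original run's.
import gym

if __name__ == '__main__':
    SAC(lambda: gym.make('FetchReach-v1'),
        ac_kwargs={'hidden_sizes': [256, 256]},
        max_ep_len=50,            # Fetch episodes are 50 steps by default
        exp_name='sac_fetchreach',
        epochs=50)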
def collect_expert(env, exp_name, n_steps, render, hierarchial, flatten, max_ep_len):
    print(render)
    if hierarchial:
        pass
    else:
        if flatten:
            env = wrappers.FlattenDictWrapper(
                env, dict_keys=['observation', 'desired_goal'])
            obs_dim = env.observation_space.shape[0]
        else:
            obs_dim = env.observation_space.spaces['observation'].shape[0] + \
                env.observation_space.spaces['desired_goal'].shape[0]
        act_dim = env.action_space.shape[0]
        act_limit = env.action_space.high[0]

        # Load the trained model and roll out deterministic episodes.
        model = SAC_model(act_limit, obs_dim, act_dim, [256, 256],
                          load=True, exp_name=exp_name)
        episodes = rollout_trajectories(n_steps=n_steps,
                                        env=env,
                                        max_ep_len=max_ep_len,
                                        goal_based=not flatten,
                                        actor=model.actor.get_deterministic_action,
                                        train=False,
                                        render=render,
                                        exp_name=exp_name,
                                        return_episode=True)

        action_buff = []
        observation_buff = []
        desired_goals_buff = []
        achieved_goals_buff = []
        controllable_achieved_goal_buff = []
        full_positional_state_buff = []

        for ep in episodes['episodes']:
            # Quick fix for sub-optimal demos: they are rare enough that we simply
            # drop them rather than collecting replacements.
            if ep[-1][2] == 0:  # i.e., the reward of the last transition is not -1.
                (observations, actions, desired_goals, achieved_goals,
                 controllable_achieved_goals, full_positional_states) = episode_to_trajectory(ep)
                action_buff.append(actions)
                observation_buff.append(observations)
                desired_goals_buff.append(desired_goals)
                achieved_goals_buff.append(achieved_goals)
                controllable_achieved_goal_buff.append(controllable_achieved_goals)
                full_positional_state_buff.append(full_positional_states)
            else:
                print('Rejecting Episode')

        np.savez('collected_data/' + str(n_steps) + exp_name,
                 acts=action_buff,
                 obs=observation_buff,
                 desired_goals=desired_goals_buff,
                 achieved_goals=achieved_goals_buff,
                 controllable_achieved_goals=controllable_achieved_goal_buff,
                 full_positional_states=full_positional_state_buff)
        print('Saved at: \n collected_data/' + str(n_steps) + exp_name + '.npz')
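
# Hedged sketch: reading the demonstrations saved by collect_expert above.
# The filename mirrors the str(n_steps) + exp_name convention used in np.savez;
# '10000' and 'my_experiment' are placeholder values, not from the original run.
import numpy as np

data = np.load('collected_data/10000my_experiment.npz', allow_pickle=True)
observations = np.concatenate(data['obs'])   # stack per-episode arrays into one batch
actions = np.concatenate(data['acts'])
print(observations.shape, actions.shape)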
def experiment(variant):
    img_size = 64
    train_top10 = VisualRandomizationConfig(
        image_directory='./experiment_textures/train/top10',
        whitelist=['Floor', 'Roof', 'Wall1', 'Wall2', 'Wall3', 'Wall4',
                   'diningTable_visible'],
        apply_arm=False,
        apply_gripper=False,
        apply_floor=True)

    expl_env = gym.make('reach_target_easy-vision-v0',
                        sparse=False,
                        img_size=img_size,
                        force_randomly_place=True,
                        force_change_position=False,
                        blank=True)
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    conv_args = {
        "input_width": 64,
        "input_height": 64,
        "input_channels": 3,
        "kernel_sizes": [4, 4, 3],
        "n_channels": [32, 64, 64],
        "strides": [2, 1, 1],
        "paddings": [0, 0, 0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        "init_w": 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)

    # es = GaussianStrategy(
    #     action_space=expl_env.action_space,
    #     max_sigma=0.3,
    #     min_sigma=0.1,  # constant sigma
    # )
    es = GaussianAndEpislonStrategy(
        action_space=expl_env.action_space,
        epsilon=0.3,
        max_sigma=0.0,
        min_sigma=0.0,  # constant sigma 0
        decay_period=1000000)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
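
# Hedged example of a variant dict that supplies the keys this experiment reads.
# The kwarg names inside trainer_kwargs / algorithm_kwargs follow common rlkit
# usage and may need adjusting for the (apparently modified) rlkit fork used here,
# which accepts evaluation_env=None. The t_fn pixel scaling is illustrative; the
# real observation transform is not shown in the source.
variant = dict(
    t_fn=lambda obs: obs / 255.0,        # placeholder: scale raw pixels to [0, 1]
    qf_kwargs=dict(),
    policy_kwargs=dict(),
    replay_buffer_size=int(1e5),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=0,      # no evaluation collector is configured above
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=50,
        batch_size=128,
    ),
)
experiment(variant)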
def GAIL(env_fn, ac_kwargs=dict(), seed=0, steps_per_epoch=5000, epochs=100,
         replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=1e-3, alpha=0.2,
         batch_size=100, start_steps=5000, max_ep_len=1000, save_freq=1,
         load=False, exp_name="Experiment_1", render=False,
         discrim_req_acc=0.7, BC=False):

    tf.random.set_seed(seed)
    np.random.seed(seed)
    env, test_env = env_fn(), env_fn()
    env = wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'desired_goal'])
    test_env = wrappers.FlattenDictWrapper(
        test_env, dict_keys=['observation', 'desired_goal'])

    # Get env dimensions.
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    SAC = SAC_model(env, obs_dim, act_dim, ac_kwargs['hidden_sizes'],
                    lr, gamma, alpha, polyak, load, exp_name)

    # Experience buffer.
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Logging.
    start_time = time.time()
    train_log_dir = 'logs/' + str(discrim_req_acc) + exp_name + ':' + str(start_time)
    summary_writer = tf.summary.create_file_writer(train_log_dir)

    discriminator = mlp_gail_discriminator()
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    if BC:
        BC_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def update_models(model, replay_buffer, steps, batch_size, current_step=None):
        agent_accuracy = 0
        expert_accuracy = 0
        # Train the discriminator until it distinguishes agent from expert
        # transitions with the required accuracy.
        print('Updating Discriminator')
        while agent_accuracy < discrim_req_acc or expert_accuracy < discrim_req_acc:
            batch = replay_buffer.sample_batch(batch_size)
            expert_batch = sample_expert_transitions(batch_size)
            _, expert_accuracy, agent_accuracy = discriminator_train_step(
                batch, expert_batch, discriminator, discriminator_optimizer,
                replay_buffer, batch_size, discrim_req_acc)
            print(expert_accuracy, agent_accuracy)

        # Now update SAC.
        print('Updating Policy')
        for j in range(steps):
            batch = replay_buffer.sample_batch(batch_size)
            batch_obs, batch_acts = batch['obs1'], batch['acts']
            agent_probs = discriminator(batch_obs, batch_acts)
            # Use the GAIL reward log(D) - log(1 - D) instead of the environment reward.
            agent_reward = (tf.math.log(agent_probs + 1e-8) -
                            tf.math.log(1 - agent_probs + 1e-8)
                            ).numpy().squeeze().astype('float32')
            batch['rews'] = agent_reward
            LossPi, LossQ1, LossQ2, LossV, Q1Vals, Q2Vals, VVals, LogPi = model.train_step(batch)
            if BC:
                # Use BC to accelerate GAIL convergence.
                expert_batch = sample_expert_transitions(batch_size)
                BC_loss = BC_step(expert_batch, model.actor, BC_optimizer)
                with summary_writer.as_default():
                    tf.summary.scalar('BC_MSE_loss', BC_loss, step=current_step + j)

    # Now collect episodes.
    total_steps = steps_per_epoch * epochs
    steps_collected = 0

    # Pretrain with BC.
    # pretrain_BC(SAC, BC_optimizer, batch_size)

    # Collect some initial random steps to initialise the replay buffer.
    random_steps = 5000
    steps_collected += rollout_trajectories(n_steps=random_steps,
                                            env=env,
                                            max_ep_len=max_ep_len,
                                            actor='random',
                                            replay_buffer=replay_buffer,
                                            summary_writer=summary_writer,
                                            exp_name=exp_name)
    update_models(SAC, replay_buffer, steps=random_steps,
                  batch_size=batch_size, current_step=steps_collected)

    # Now act with our actor: alternately collect data, then train.
    while steps_collected < total_steps:
        # Collect an episode.
        steps_collected += rollout_trajectories(n_steps=max_ep_len,
                                                env=env,
                                                max_ep_len=max_ep_len,
                                                actor=SAC.actor.get_stochastic_action,
                                                replay_buffer=replay_buffer,
                                                summary_writer=summary_writer,
                                                current_total_steps=steps_collected,
                                                exp_name=exp_name)
        # Then take that many training steps.
        update_models(SAC, replay_buffer, steps=max_ep_len,
                      batch_size=batch_size, current_step=steps_collected)

        # If an epoch has elapsed, save and test.
        if steps_collected > 0 and steps_collected % steps_per_epoch == 0:
            SAC.save_weights()
            # Test the performance of the deterministic version of the agent.
            rollout_trajectories(n_steps=max_ep_len * 10,
                                 env=test_env,
                                 max_ep_len=max_ep_len,
                                 actor=SAC.actor.get_deterministic_action,
                                 summary_writer=summary_writer,
                                 current_total_steps=steps_collected,
                                 train=False,
                                 render=True,
                                 exp_name=exp_name)
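
# The discriminator_train_step helper called above is not shown in the source.
# Below is a minimal, self-contained sketch of what such a GAIL discriminator
# update with accuracy tracking could look like, assuming the discriminator
# outputs P(expert | obs, act) and that agent/expert batches expose the
# 'obs1'/'obs' and 'acts' keys used above. The function name and batch keys for
# the expert data are assumptions.
import tensorflow as tf

bce = tf.keras.losses.BinaryCrossentropy()


def gail_discriminator_step(agent_batch, expert_batch, discriminator, optimizer):
    agent_obs, agent_acts = agent_batch['obs1'], agent_batch['acts']
    expert_obs, expert_acts = expert_batch['obs'], expert_batch['acts']
    with tf.GradientTape() as tape:
        agent_probs = discriminator(agent_obs, agent_acts)     # should move towards 0
        expert_probs = discriminator(expert_obs, expert_acts)  # should move towards 1
        loss = bce(tf.zeros_like(agent_probs), agent_probs) + \
               bce(tf.ones_like(expert_probs), expert_probs)
    grads = tape.gradient(loss, discriminator.trainable_variables)
    optimizer.apply_gradients(zip(grads, discriminator.trainable_variables))
    # Accuracy: fraction of samples classified on the correct side of 0.5.
    agent_acc = tf.reduce_mean(tf.cast(agent_probs < 0.5, tf.float32))
    expert_acc = tf.reduce_mean(tf.cast(expert_probs >= 0.5, tf.float32))
    return loss, expert_acc, agent_acc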
def experiment(variant):
    expl_env = envs[variant['env']](variant['dr'])
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)

    action_dim = expl_env.action_space.low.size

    conv_args = {
        "input_width": 16,
        "input_height": 16,
        "input_channels": 8,
        "kernel_sizes": [4],
        "n_channels": [32],
        "strides": [4],
        "paddings": [0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        "init_w": 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)

    if variant['noise'] == "eps":
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            epsilon=0.3,
            max_sigma=0.0,
            min_sigma=0.0,  # constant sigma 0
            decay_period=1000000)
    elif variant['noise'] == "gaussian":
        es = GaussianStrategy(action_space=expl_env.action_space,
                              max_sigma=0.3,
                              min_sigma=0.1,
                              decay_period=1000000)
    else:
        print("unsupported param for --noise")
        assert False

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
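
# Hedged driver sketch: how the env / domain-randomisation / noise switches read
# by this experiment might be populated from the command line. The argument
# names, the 'reach' key into the envs registry, and the remaining variant
# values are illustrative assumptions, mirroring the variant sketch shown after
# the vision experiment above.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='reach')
    parser.add_argument('--dr', action='store_true')
    parser.add_argument('--noise', choices=['eps', 'gaussian'], default='eps')
    args = parser.parse_args()

    variant = dict(
        env=args.env,
        dr=args.dr,
        noise=args.noise,
        t_fn=lambda obs: obs,            # placeholder observation transform
        qf_kwargs=dict(),
        policy_kwargs=dict(),
        replay_buffer_size=int(1e5),
        trainer_kwargs=dict(discount=0.99),
        algorithm_kwargs=dict(num_epochs=100,
                              num_eval_steps_per_epoch=0,
                              num_trains_per_train_loop=1000,
                              num_expl_steps_per_train_loop=1000,
                              min_num_steps_before_training=1000,
                              max_path_length=50,
                              batch_size=128),
    )
    experiment(variant)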
def run_games_for_agent(self, agent_class):
    """Runs a set of games for a given agent, saving the results in self.results."""
    # Stores this agent's results.
    agent_results = []
    agent_name = agent_class.agent_name
    agent_group = self.agent_to_agent_group[agent_name]
    agent_round = 1

    # For every game the agent needs to run.
    for run in range(self.config.runs_per_agent):
        # Copy configurations to be provided to the agent.
        agent_config = copy.deepcopy(self.config)

        # If the env is changeable (different episodes can have different goals) and
        # the agent cannot handle changeable goals without flattening, flatten the env.
        if self.environment_has_changeable_goals(agent_config.environment) and \
                self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
            print("Flattening changeable-goal environment for agent {}".format(agent_name))
            agent_config.environment = wrappers.FlattenDictWrapper(
                agent_config.environment,
                dict_keys=["observation", "desired_goal"])

        # Generate a random seed for the agent based on the config.
        if self.config.randomise_random_seed:
            agent_config.seed = random.randint(0, 2**32 - 2)

        # Get specific configurations given the agent's type.
        agent_config.hyperparameters = agent_config.hyperparameters

        # Print some debug information.
        print("AGENT NAME: {}".format(agent_name))
        print("\033[1m" + "{}: {}".format(agent_round, agent_name) + "\033[0m", flush=True)

        # Instantiate the agent with the given agent-type configurations.
        agent = agent_class(agent_config)

        # Get env name.
        self.environment_name = agent.environment_title

        # Print the agent's hyperparameters and seed.
        print(agent.hyperparameters)
        print("RANDOM SEED ", agent_config.seed)

        # Run episodes (n is specified in the config as "num_episodes_to_run").
        game_scores, rolling_scores, time_taken = agent.run_n_episodes()

        # Print run time.
        print("Time taken: {}".format(time_taken), flush=True)
        self.print_two_empty_lines()

        # Append results to this agent's result list.
        agent_results.append([game_scores, rolling_scores, len(rolling_scores),
                              -1 * max(rolling_scores), time_taken])

        # Finally, increment the agent run counter.
        agent_round += 1

    # After the agent's runs are over, store its results in the results dictionary.
    self.results[agent_name] = agent_results