def prepare_env(env_name, visionmodel_path, **env_kwargs):
    from gym.spaces import Box

    if env_name.find('doorenv')>-1:
        expl_env = NormalizedBoxEnv(gym.make(env_name, **env_kwargs))
        xml_path = expl_env._wrapped_env.xml_path
        if env_kwargs['visionnet_input']:
            print("using vision")
            eval_env = None
            if env_kwargs['unity']:
                expl_env._wrapped_env.init()
        else:
            print("no vision")
            eval_env = NormalizedBoxEnv(gym.make(env_name, **env_kwargs))

        env_obj = expl_env._wrapped_env
        expl_env.observation_space = Box(np.zeros(env_obj.nn*2+3), np.zeros(env_obj.nn*2+3), dtype=np.float32)
        if eval_env:
            eval_env.observation_space = Box(np.zeros(env_obj.nn*2+3), np.zeros(env_obj.nn*2+3), dtype=np.float32)
    elif env_name.find("Fetch")>-1:
        env = gym.make(env_name, reward_type='sparse')
        env = wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])
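        # Note: expl_env and eval_env below wrap the same underlying env instance.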
        expl_env = NormalizedBoxEnv(env)
        eval_env = NormalizedBoxEnv(env)
        env_obj = None
    else:
        expl_env = NormalizedBoxEnv(gym.make(env_name))
        eval_env = NormalizedBoxEnv(gym.make(env_name))
        env_obj = None

    return expl_env, eval_env, env_obj
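
# Usage sketch (hypothetical env name; only the env_kwargs keys read above are shown):
# expl_env, eval_env, env_obj = prepare_env(
#     'doorenv-v0', visionmodel_path=None, visionnet_input=False, unity=False)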
Example 2
def run_task(snapshot_config, *_):
    """Run task."""
    with LocalTFRunner(snapshot_config=snapshot_config) as runner:
        env = gym.make('FetchPush-v1')
        env = TfEnv(wp.FlattenDictWrapper(env, dict_keys=["observation", "desired_goal"]))

        policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(32, 32))

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=500,
                    discount=0.99,
                    max_kl_step=0.01)

        runner.setup(algo, env)
        runner.train(n_epochs=40, batch_size=4000)
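
# A sketch of how a task function like this is typically launched in the garage
# release this snippet appears to target (not part of the original snippet):
# from garage.experiment import run_experiment
# run_experiment(run_task, snapshot_mode='last', seed=1)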
Example 3
env = gym.make(ENV_NAME)

train_length = int(0.9*(len(observations)))
print(train_length)
train_obs = observations[:train_length,:]
train_acts = actions[:train_length,:]
valid_obs = observations[train_length:,:]
valid_acts = actions[train_length:,:]

print(train_obs.shape)
print(valid_obs.shape)

#ENV_NAME='Pendulum-v0'


env = wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])
act_dim = env.action_space.shape[0]
act_limit = env.action_space.high[0]

start_time = time.time()
train_log_dir = 'logs/' + 'BC:' + str(start_time)
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
policy = mlp_gaussian_policy(act_dim=actions.shape[1], act_limit=act_limit,
                             hidden_sizes=[128, 128])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
max_ep_len = 200


# Behavioural-clone the policy on the expert data.
@tf.function
def train_step(obs, expert_act):
    with tf.GradientTape() as tape:
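        # The original snippet is truncated here; a minimal completion of the
        # behavioural-cloning step is sketched below, assuming the policy
        # object returns a batch of actions when called on observations
        # (an assumption about mlp_gaussian_policy, which is not shown).
        pi_act = policy(obs)
        loss = tf.reduce_mean(tf.square(pi_act - expert_act))
    gradients = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(gradients, policy.trainable_variables))
    return loss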
Example 4
def SAC(env_fn,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=2000,
        epochs=100,
        replay_size=int(1e6),
        gamma=0.99,
        polyak=0.995,
        lr=1e-3,
        alpha=0.2,
        batch_size=100,
        start_steps=2000,
        max_ep_len=1000,
        save_freq=1,
        load=False,
        exp_name="Experiment_1",
        render=False):
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env, test_env = env_fn(), env_fn()
    test_env.render(mode='human')
    test_env.reset()

    env = wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'desired_goal'])
    test_env = wrappers.FlattenDictWrapper(
        test_env, dict_keys=['observation', 'desired_goal'])
    # Get Env dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]
    SAC = SAC_model(act_limit, obs_dim, act_dim, ac_kwargs['hidden_sizes'], lr,
                    gamma, alpha, polyak, load, exp_name)
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    # Logging
    start_time = time.time()
    train_log_dir = 'logs/' + exp_name + str(int(start_time))
    summary_writer = SummaryWriter(train_log_dir)

    def update_models(model, replay_buffer, steps, batch_size):
        for j in range(steps):
            batch = replay_buffer.sample_batch(batch_size)
            LossPi, LossQ1, LossQ2, LossV, Q1Vals, Q2Vals, VVals, LogPi = model.update(
                batch)

    # Now collect episodes.
    total_steps = steps_per_epoch * epochs
    steps_collected = 0

    if not load:
        # collect some initial random steps to initialise
        steps_collected += rollout_trajectories(n_steps=start_steps,
                                                env=env,
                                                max_ep_len=max_ep_len,
                                                actor='random',
                                                replay_buffer=replay_buffer,
                                                summary_writer=summary_writer,
                                                exp_name=exp_name)
        update_models(SAC,
                      replay_buffer,
                      steps=steps_collected,
                      batch_size=batch_size)

    # Now act with our actor: alternately collect data, then train.
    while steps_collected < total_steps:
        # collect an episode
        steps_collected += rollout_trajectories(
            n_steps=max_ep_len,
            env=env,
            max_ep_len=max_ep_len,
            actor=SAC.actor.get_stochastic_action,
            replay_buffer=replay_buffer,
            summary_writer=summary_writer,
            current_total_steps=steps_collected,
            exp_name=exp_name)
        # Take that many training steps.
        update_models(SAC,
                      replay_buffer,
                      steps=max_ep_len,
                      batch_size=batch_size)

        # if an epoch has elapsed, save and test.
        if steps_collected > 0 and steps_collected % steps_per_epoch == 0:
            SAC.save_weights()
            # Test the performance of the deterministic version of the agent.
            rollout_trajectories(n_steps=max_ep_len * 10,
                                 env=test_env,
                                 max_ep_len=max_ep_len,
                                 actor=SAC.actor.get_deterministic_action,
                                 summary_writer=summary_writer,
                                 current_total_steps=steps_collected,
                                 train=False,
                                 render=True,
                                 exp_name=exp_name)
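
# Usage sketch (hypothetical env and kwargs): train on a goal-based Fetch task,
# relying on the FlattenDictWrapper applied inside SAC above.
# SAC(lambda: gym.make('FetchReach-v1'),
#     ac_kwargs={'hidden_sizes': [256, 256]},
#     exp_name='fetch_reach_sac')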
def collect_expert(env, exp_name, n_steps, render, hierarchial, flatten,
                   max_ep_len):

    print(render)
    if hierarchial:
        pass
    else:
        if flatten:
            env = wrappers.FlattenDictWrapper(
                env, dict_keys=['observation', 'desired_goal'])
            obs_dim = env.observation_space.shape[0]
        else:
            obs_dim = env.observation_space.spaces['observation'].shape[0] + \
                env.observation_space.spaces['desired_goal'].shape[0]

        act_dim = env.action_space.shape[0]
        act_limit = env.action_space.high[0]
        # Logging
        model = SAC_model(act_limit,
                          obs_dim,
                          act_dim, [256, 256],
                          load=True,
                          exp_name=exp_name)

        episodes = rollout_trajectories(
            n_steps=n_steps,
            env=env,
            max_ep_len=max_ep_len,
            goal_based=not flatten,
            actor=model.actor.get_deterministic_action,
            train=False,
            render=render,
            exp_name=exp_name,
            return_episode=True)

        action_buff = []
        observation_buff = []
        desired_goals_buff = []
        achieved_goals_buff = []
        controllable_achieved_goal_buff = []
        full_positional_state_buff = []
        for ep in episodes['episodes']:

            # Quick fix for sub-optimal demos: they are rare, so simply skip
            # them here (later, go collect more?).
            if ep[-1][2] == 0:  # i.e., the reward of the last transition is not -1.

                (observations, actions, desired_goals, achieved_goals,
                 controllable_achieved_goals,
                 full_positional_states) = episode_to_trajectory(ep)
                action_buff.append(actions)
                observation_buff.append(observations)
                desired_goals_buff.append(desired_goals)
                achieved_goals_buff.append(achieved_goals)
                controllable_achieved_goal_buff.append(
                    controllable_achieved_goals)
                full_positional_state_buff.append(full_positional_states)
            else:
                print('Rejecting Episode')

        np.savez('collected_data/' + str(n_steps) + exp_name,
                 acts=action_buff,
                 obs=observation_buff,
                 desired_goals=desired_goals_buff,
                 achieved_goals=achieved_goals_buff,
                 controllable_achieved_goals=controllable_achieved_goal_buff,
                 full_positional_states=full_positional_state_buff)
        print('Saved at: \n collected_data/' + str(n_steps) + exp_name +
              '.npz')
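
        # The demonstrations saved above can be reloaded later with NumPy, e.g.:
        # data = np.load('collected_data/' + str(n_steps) + exp_name + '.npz',
        #                allow_pickle=True)
        # expert_obs, expert_acts = data['obs'], data['acts']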
Example 6
def experiment(variant):
    img_size = 64
    train_top10 = VisualRandomizationConfig(
        image_directory='./experiment_textures/train/top10',
        whitelist=[
            'Floor', 'Roof', 'Wall1', 'Wall2', 'Wall3', 'Wall4',
            'diningTable_visible'
        ],
        apply_arm=False,
        apply_gripper=False,
        apply_floor=True)
    expl_env = gym.make('reach_target_easy-vision-v0',
                        sparse=False,
                        img_size=img_size,
                        force_randomly_place=True,
                        force_change_position=False,
                        blank=True)
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    conv_args = {
        "input_width": 64,
        "input_height": 64,
        "input_channels": 3,
        "kernel_sizes": [4, 4, 3],
        "n_channels": [32, 64, 64],
        "strides": [2, 1, 1],
        "paddings": [0, 0, 0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        'init_w': 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)
    # es = GaussianStrategy(
    #     action_space=expl_env.action_space,
    #     max_sigma=0.3,
    #     min_sigma=0.1,  # Constant sigma
    # )

    es = GaussianAndEpislonStrategy(
        action_space=expl_env.action_space,
        epsilon=0.3,
        max_sigma=0.0,
        min_sigma=0.0,  #constant sigma 0
        decay_period=1000000)

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
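
# The variant dict read above needs at least these keys; the values here are
# illustrative placeholders only:
# variant = dict(t_fn=lambda obs: obs, qf_kwargs={}, policy_kwargs={},
#                trainer_kwargs={}, algorithm_kwargs={},
#                replay_buffer_size=int(1e6))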
Example 7
def GAIL(env_fn,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=0.99,
         polyak=0.995,
         lr=1e-3,
         alpha=0.2,
         batch_size=100,
         start_steps=5000,
         max_ep_len=1000,
         save_freq=1,
         load=False,
         exp_name="Experiment_1",
         render=False,
         discrim_req_acc=0.7,
         BC=False):

    tf.random.set_seed(seed)
    np.random.seed(seed)
    env, test_env = env_fn(), env_fn()
    env = wrappers.FlattenDictWrapper(
        env, dict_keys=['observation', 'desired_goal'])
    test_env = wrappers.FlattenDictWrapper(
        test_env, dict_keys=['observation', 'desired_goal'])
    # Get Env dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]
    SAC = SAC_model(act_limit, obs_dim, act_dim, ac_kwargs['hidden_sizes'], lr,
                    gamma, alpha, polyak, load, exp_name)
    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim,
                                 act_dim=act_dim,
                                 size=replay_size)

    #Logging
    start_time = time.time()
    train_log_dir = 'logs/' + str(discrim_req_acc) + exp_name + ':' + str(
        start_time)
    summary_writer = tf.summary.create_file_writer(train_log_dir)

    discriminator = mlp_gail_discriminator()
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    if BC:
        BC_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    def update_models(model,
                      replay_buffer,
                      steps,
                      batch_size,
                      current_step=None):
        agent_accuracy = 0
        expert_accuracy = 0
        # Train the discriminator until it can sufficiently distinguish expert
        # transitions from agent transitions.
        print('Updating Discriminator')
        while agent_accuracy < discrim_req_acc or expert_accuracy < discrim_req_acc:
            batch = replay_buffer.sample_batch(batch_size)
            expert_batch = sample_expert_transitions(batch_size)
            _, expert_accuracy, agent_accuracy = discriminator_train_step(
                batch, expert_batch, discriminator, discriminator_optimizer,
                replay_buffer, batch_size, discrim_req_acc)
            print(expert_accuracy, agent_accuracy)

        # now update SAC
        print('Updating Policy')
        for j in range(steps):
            batch = replay_buffer.sample_batch(batch_size)
            batch_obs, batch_acts = batch['obs1'], batch['acts']
            agent_probs = discriminator(batch_obs, batch_acts)
            # Use the GAIL reward log(D) - log(1 - D) instead of the environment reward.
            agent_reward = (tf.math.log(agent_probs + 1e-8) -
                            tf.math.log(1 - agent_probs + 1e-8)
                            ).numpy().squeeze().astype('float32')
            batch['rews'] = agent_reward

            LossPi, LossQ1, LossQ2, LossV, Q1Vals, Q2Vals, VVals, LogPi = model.train_step(
                batch)
            if BC:
                # Use BC to accelerate GAIL convergence

                expert_batch = sample_expert_transitions(batch_size)
                BC_loss = BC_step(expert_batch, model.actor, BC_optimizer)
                with summary_writer.as_default():
                    tf.summary.scalar('BC_MSE_loss',
                                      BC_loss,
                                      step=current_step + j)

    # Now collect episodes.
    total_steps = steps_per_epoch * epochs
    steps_collected = 0

    #pretrain with BC
    #pretrain_BC(SAC, BC_optimizer, batch_size)
    # collect some initial random steps to initialise
    random_steps = 5000
    steps_collected += rollout_trajectories(n_steps=random_steps,
                                            env=env,
                                            max_ep_len=max_ep_len,
                                            actor='random',
                                            replay_buffer=replay_buffer,
                                            summary_writer=summary_writer,
                                            exp_name=exp_name)

    update_models(SAC,
                  replay_buffer,
                  steps=random_steps,
                  batch_size=batch_size,
                  current_step=steps_collected)

    # Now act with our actor: alternately collect data, then train.
    while steps_collected < total_steps:
        # collect an episode
        steps_collected += rollout_trajectories(
            n_steps=max_ep_len,
            env=env,
            max_ep_len=max_ep_len,
            actor=SAC.actor.get_stochastic_action,
            replay_buffer=replay_buffer,
            summary_writer=summary_writer,
            current_total_steps=steps_collected,
            exp_name=exp_name)
        # Take that many training steps.
        update_models(SAC,
                      replay_buffer,
                      steps=max_ep_len,
                      batch_size=batch_size,
                      current_step=steps_collected)

        # if an epoch has elapsed, save and test.
        if steps_collected > 0 and steps_collected % steps_per_epoch == 0:
            SAC.save_weights()
            # Test the performance of the deterministic version of the agent.
            rollout_trajectories(n_steps=max_ep_len * 10,
                                 env=test_env,
                                 max_ep_len=max_ep_len,
                                 actor=SAC.actor.get_deterministic_action,
                                 summary_writer=summary_writer,
                                 current_total_steps=steps_collected,
                                 train=False,
                                 render=True,
                                 exp_name=exp_name)
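
# Note: GAIL above relies on sample_expert_transitions (defined elsewhere) to
# draw (observation, action) pairs from demonstrations such as those saved by
# collect_expert; inside update_models the discriminator-based reward then
# replaces the environment reward stored in the replay buffer.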
Example 8
def experiment(variant):
    expl_env = envs[variant['env']](variant['dr'])
    expl_env = wrappers.FlattenDictWrapper(expl_env, dict_keys=['observation'])
    t_fn = variant["t_fn"]
    expl_env = TransformObservationWrapper(expl_env, t_fn)
    action_dim = expl_env.action_space.low.size
    conv_args = {
        "input_width": 16,
        "input_height": 16,
        "input_channels": 8,
        "kernel_sizes": [4],
        "n_channels": [32],
        "strides": [4],
        "paddings": [0],
        "hidden_sizes": [1024, 512],
        "batch_norm_conv": False,
        "batch_norm_fc": False,
        'init_w': 1e-4,
        "hidden_init": nn.init.orthogonal_,
        "hidden_activation": nn.ReLU(),
    }

    qf1 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    qf2 = FlattenCNN(output_size=1,
                     added_fc_input_size=action_dim,
                     **variant['qf_kwargs'],
                     **conv_args)
    target_qf1 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    target_qf2 = FlattenCNN(output_size=1,
                            added_fc_input_size=action_dim,
                            **variant['qf_kwargs'],
                            **conv_args)
    policy = TanhCNNPolicy(output_size=action_dim,
                           **variant['policy_kwargs'],
                           **conv_args)
    target_policy = TanhCNNPolicy(output_size=action_dim,
                                  **variant['policy_kwargs'],
                                  **conv_args)
    if variant['noise'] == "eps":
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            epsilon=0.3,
            max_sigma=0.0,
            min_sigma=0.0,  #constant sigma 0
            decay_period=1000000)
    elif variant['noise'] == "gaussian":
        es = GaussianStrategy(action_space=expl_env.action_space,
                              max_sigma=0.3,
                              min_sigma=0.1,
                              decay_period=1000000)
    else:
        raise ValueError("unsupported param for --noise")
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    expl_path_collector = MdpPathCollector(
        expl_env,
        exploration_policy,
    )
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = TD3Trainer(policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         target_policy=target_policy,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=None,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=None,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 9
    def run_games_for_agent(self, agent_class):
        """Runs a set of games for a given agent, saving the results in self.results"""
        # Stores this agent's results.
        agent_results = []
        agent_name = agent_class.agent_name
        agent_group = self.agent_to_agent_group[agent_name]
        agent_round = 1

        # For every game the agent needs to run.
        for run in range(self.config.runs_per_agent):
            # Copy configurations to be provided to agent.
            agent_config = copy.deepcopy(self.config)

            # If the env is changeable, meaning that different episodes can have different goals.
            if self.environment_has_changeable_goals(agent_config.environment) and \
                    self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
                print("Flattening changeable-goal environment for agent {}".
                      format(agent_name))
                agent_config.environment = wrappers.FlattenDictWrapper(
                    agent_config.environment,
                    dict_keys=["observation", "desired_goal"])

            # Generate random seed for agent based on config.
            if self.config.randomise_random_seed:
                agent_config.seed = random.randint(0, 2**32 - 2)

            # Get the hyperparameters specific to this agent's group.
            agent_config.hyperparameters = agent_config.hyperparameters[agent_group]

            # Print some debug information.
            print("AGENT NAME: {}".format(agent_name))
            print("\033[1m" + "{}: {}".format(agent_round, agent_name) +
                  "\033[0m",
                  flush=True)

            # Instantiate agent with the given agent-type configurations.
            agent = agent_class(agent_config)

            # Get env name.
            self.environment_name = agent.environment_title

            # Print agent's hyperparameters and seed.
            print(agent.hyperparameters)
            print("RANDOM SEED ", agent_config.seed)

            # Run episodes (n is specified in config as "num_episodes_to_run")
            game_scores, rolling_scores, time_taken = agent.run_n_episodes()

            # Print run time.
            print("Time taken: {}".format(time_taken), flush=True)
            self.print_two_empty_lines()

            # Append results to this agent's result list.
            agent_results.append([
                game_scores, rolling_scores,
                len(rolling_scores), -1 * max(rolling_scores), time_taken
            ])

            # Finally, increment agent run counter.
            agent_round += 1

        # After agent's run is over, append results to results dictionary.
        self.results[agent_name] = agent_results
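        # After all runs, self.results maps each agent name to a list of
        # [game_scores, rolling_scores, len(rolling_scores),
        #  -1 * max(rolling_scores), time_taken] entries, one per run.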