Example 1
    def __init__(self,
                 *args,
                 observation_key=None,
                 desired_goal_key=None,
                 **kwargs):
        HER.__init__(
            self,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
        )
        SoftActorCritic.__init__(self, *args, **kwargs)
        assert isinstance(self.replay_buffer, ObsDictRelabelingBuffer)
Example 2
    def __init__(
            self,
            *args,
            her_kwargs,
            sac_kwargs,
            **kwargs
    ):
        HER.__init__(self, **her_kwargs)
        SoftActorCritic.__init__(self, *args, **kwargs, **sac_kwargs)
        assert isinstance(
            self.replay_buffer, RelabelingReplayBuffer
        ) or isinstance(
            self.replay_buffer, ObsDictRelabelingBuffer
        )
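A stripped-down sketch of the pattern both snippets above rely on: a combined class initializes each parent explicitly rather than through super(). The HER and SoftActorCritic classes below are illustrative stand-ins, not the rlkit implementations.

# Illustrative stand-ins only; not rlkit code.
class HER:
    def __init__(self, observation_key=None, desired_goal_key=None):
        self.observation_key = observation_key
        self.desired_goal_key = desired_goal_key


class SoftActorCritic:
    def __init__(self, env, replay_buffer):
        self.env = env
        self.replay_buffer = replay_buffer


class HerSac(HER, SoftActorCritic):
    def __init__(self, *args, observation_key=None, desired_goal_key=None,
                 **kwargs):
        # Initialize each base explicitly, mirroring the snippets above.
        HER.__init__(self,
                     observation_key=observation_key,
                     desired_goal_key=desired_goal_key)
        SoftActorCritic.__init__(self, *args, **kwargs)


combined = HerSac(env='dummy-env', replay_buffer='dummy-buffer',
                  observation_key='observation', desired_goal_key='desired_goal')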
Example 3
def experiment(variant):
    #env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
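For reference, a minimal sketch of how an experiment function in this style is typically driven. The variant keys mirror the ones read above (net_size, algo_params); the concrete values and the contents of algo_params are placeholder assumptions that depend on the rlkit version in use.

# Hypothetical driver; all values below are illustrative placeholders.
if __name__ == "__main__":
    variant = dict(
        net_size=300,
        algo_params=dict(
            num_epochs=100,       # assumed outer-loop setting
            batch_size=128,       # assumed SAC minibatch size
            discount=0.99,        # assumed discount factor
            reward_scale=1.0,     # assumed reward scaling
        ),
    )
    experiment(variant)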
Example 4
File: sac.py Project: jcoreyes/erl
def experiment(variant):
    env = SawyerXYZEnv(**variant['env_kwargs'])
    env = MultitaskToFlatEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = ConcatMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = ConcatMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
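This variant of the script expects a nested configuration rather than a single net_size. A sketch of a matching variant dict follows; the keys are exactly the ones the function reads, while the values are placeholder assumptions.

# Placeholder variant matching the keys read by experiment() above.
variant = dict(
    env_kwargs=dict(),                        # forwarded to SawyerXYZEnv
    normalize=True,                           # wrap with NormalizedBoxEnv
    qf_kwargs=dict(hidden_sizes=[400, 300]),  # forwarded to ConcatMlp
    vf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    algo_kwargs=dict(),                       # forwarded to SoftActorCritic
)
experiment(variant)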
Example 5
def experiment(variant):
    env = NormalizedBoxEnv(CartpoleSwingupSparseEnv())
    #env = NormalizedBoxEnv(HalfCheetahEnv())
    #env = NormalizedBoxEnv(Continuous_MountainCarEnv())
    #env = DIAYNWrappedEnv(NormalizedBoxEnv(HumanoidEnv()))
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    skill_dim = 0  #50
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + skill_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + skill_dim,
        action_dim=action_dim,
        #k=4,
    )
    disc = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=skill_dim if skill_dim > 0 else 1,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        #disc=disc,
        #skill_dim=skill_dim,
        **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 6
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))

    env = ReacherEnv()
    training_env = ReacherEnv()
    
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # exp_specs is assumed to be a module-level experiment-spec dict in the
    # original script; it is not derived from the variant argument.
    total_meta_variable_dim = 0
    for dims in exp_specs['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example 7
def experiment(variant, env_name, record_name, record_every_episode):
    #env = CartPoleEnv()
    env = gym.make(env_name)
    # A workaround to give this info later on
    # (Such naughty business...)
    randomize_settings = {
        "turnframes": [10, 10],
        "engagement_distance": [100, 200]
    }
    env.record_name = record_name
    env.record_every_episode = record_every_episode
    env.randomize_settings = randomize_settings
    env = OneHotsToDecimalsAndRecordAndRandomize(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    num_categoricals = len(env.action_space.nvec)
    num_categories = env.action_space.nvec[0]

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        # Action is fed in as a raveled one-hot vector
        input_size=obs_dim + int(np.sum(env.action_space.nvec)),
        output_size=1,
        hidden_activation=F.sigmoid,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
        hidden_activation=F.sigmoid,
    )

    # For multi-discrete
    policy = MultiCategoricalPolicy(hidden_sizes=[net_size, net_size],
                                    obs_dim=obs_dim,
                                    num_categoricals=num_categoricals,
                                    num_categories=num_categories,
                                    hidden_activation=F.sigmoid)

    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
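The critic's input_size above assumes the multi-discrete action reaches the network as a concatenation of one-hot vectors, one per categorical, hence np.sum(env.action_space.nvec). A small standalone sketch of that encoding, independent of the wrappers used in the example:

import numpy as np

def ravel_one_hot(action, nvec):
    """Encode a MultiDiscrete action as concatenated one-hot vectors.

    action: integer choices, one per categorical.
    nvec:   number of categories per categorical (as in gym's MultiDiscrete.nvec).
    """
    parts = []
    for a, n in zip(action, nvec):
        one_hot = np.zeros(n)
        one_hot[a] = 1.0
        parts.append(one_hot)
    return np.concatenate(parts)

# e.g. two categoricals with 3 categories each
print(ravel_one_hot([2, 0], [3, 3]))  # -> [0. 0. 1. 1. 0. 0.]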
Example 8
def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    plotter = QFPolicyPlotter(qf=qf,
                              policy=policy,
                              obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0],
                                                [2.5, 2.5]]),
                              default_action=[np.nan, np.nan],
                              n_samples=100)
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 9
def experiment(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = ConcatMlp(input_size=obs_dim + action_dim,
                   output_size=1,
                   **variant['qf_kwargs'])
    vf = ConcatMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 10
def experiment(variant):

    farmlist_base = [('123.123.123.123', 4)]

    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        environment_farming=True,
        farmlist_base=farmlist_base,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example 11
def experiment(variant):
    ptu._use_gpu = variant['gpu']
    env = NormalizedBoxEnv(gym.make(variant['env_name']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']

    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        save_environment=False,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
Example 12
def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')
    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                training_env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example 13
def experiment(variant):
    # env = normalize(GymEnv(
    #     'HalfCheetah-v1',
    #     force_reset=True,
    #     record_video=False,
    #     record_log=False,
    # ))
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example 14
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    # env = gym.make('HalfCheetah-v2')

    env = MujocoManipEnv("SawyerBinsCanEnv")  # wrap as a gym env
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
        # ACTION_SKIP is assumed to be a module-level constant in the original script.
        action_skip=ACTION_SKIP,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example 15
def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    qf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = ConcatMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")
Example 16
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        # Explicit Loader keeps this working on PyYAML >= 5.1.
        listings = yaml.load(f.read(), Loader=yaml.FullLoader)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    disc_model = MLPDisc(
        obs_dim + action_dim
        if not variant['adv_irl_params']['state_only']
        else 2 * obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvIRL(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       expert_replay_buffer=expert_replay_buffer,
                       **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Example 17
def experiment(log_dir, variant_overwrite, cpu=False):
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    assert all([
        a == b
        for a, b in zip(env.sampled_goal, variant['env_kwargs']['goal_prior'])
    ])

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()
Example 18
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        # Explicit Loader keeps this working on PyYAML >= 5.1.
        listings = yaml.load(f.read(), Loader=yaml.FullLoader)
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    if variant['disc_model_type'] == 'resnet_disc':
        disc_model = ResNetAIRLDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'])
    else:
        disc_model = MLPDisc(len(state_indices),
                             num_layer_blocks=variant['disc_num_blocks'],
                             hid_dim=variant['disc_hid_dim'],
                             hid_act=variant['disc_hid_act'],
                             use_bn=variant['disc_use_bn'],
                             clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvSMM(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       target_state_buffer=target_state_buffer,
                       state_indices=state_indices,
                       **variant['adv_smm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Example 19
    def __init__(self, use_history, SMM_path, num_skills):
        self.use_history = use_history
        log_dir = SMM_path
        self.num_skills = num_skills
        from rlkit.torch.sac.sac import SoftActorCritic
        self.config = dict(
            env_kwargs=dict(
                goal_prior=[1.12871704, 0.46767739,
                            0.42],  # Test-time object goal position
                sample_goal=False,
                shaped_rewards=[
                    'object_off_table', 'object_goal_indicator',
                    'object_gripper_indicator', 'action_penalty'
                ],
                terminate_upon_success=False,
                terminate_upon_failure=False,
            ),
            test_goal=[1.12871704, 0.46767739, 0.42],
            algo_kwargs=dict(
                max_path_length=50,  # Maximum path length in the environment
                num_episodes=100,  # Number of test episodes
                # Weight of the extrinsic reward relative to the SAC reward
                reward_scale=100,
                collection_mode='episodic',  # Each epoch is one episode
                num_updates_per_episode=0,  # Evaluate without additional training
            ),
            smm_kwargs=dict(
                # p(z) coeff for SMM posterior adaptation (a higher value
                # corresponds to a more uniform p(z))
                update_p_z_prior_coeff=1,
                # Turn off SMM reward.
                state_entropy_coeff=0,
                latent_entropy_coeff=0,
                latent_conditional_entropy_coeff=0,
                discriminator_lr=0,
            ),
        )
        with open('/home/zj/Desktop/sample/smm/configs/test_no_ha_point.json') as f:
            exp_params = json.load(f)
        overwrite_dict(self.config, exp_params)

        ptu.set_gpu_mode(True)
        env, _, data, variant = load_experiment(log_dir, self.config)

        variant['historical_policies_kwargs']['num_historical_policies'] = (
            10 if self.use_history else 0
        )
        self.policy = data['policy']

        vf = data['vf']
        qf = data['qf']
        self.algorithm = SoftActorCritic(
            env=env,
            training_env=env,  # can't clone box2d env cause of swig
            save_environment=False,  # can't save box2d env cause of swig
            policy=self.policy,
            qf=qf,
            vf=vf,
            **variant['algo_kwargs'],
        )
        self.policy.to('cuda')
        if variant['intrinsic_reward'] == 'smm':
            discriminator = data['discriminator']
            density_model = data['density_model']
            SMMHook(base_algorithm=self.algorithm,
                    discriminator=discriminator,
                    density_model=density_model,
                    **variant['smm_kwargs'])

        # Overwrite algorithm for historical averaging.
        if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
            HistoricalPoliciesHook(
                base_algorithm=self.algorithm,
                log_dir=log_dir,
                **variant['historical_policies_kwargs'],
            )
Example 20
def experiment(log_dir, variant_overwrite, cpu=False):
    if not cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    # Load experiment from file.
    env, _, data, variant = load_experiment(log_dir, variant_overwrite)
    # assert all([a == b for a, b in zip(env.sampled_goal, variant['env_kwargs']['goal_prior'])])

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}'.format(
        variant['algo_kwargs']['num_episodes'],
        variant['algo_kwargs']['max_path_length'],
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        variant['algo_kwargs']['reward_scale'],
        variant['historical_policies_kwargs']['num_historical_policies'],
    )
    exp_id = create_exp_name(exp_id)
    out_dir = os.path.join(log_dir, exp_id)
    print('Logging to:', out_dir)
    setup_logger(
        log_dir=out_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if variant['intrinsic_reward'] == 'smm':
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if variant['historical_policies_kwargs']['num_historical_policies'] > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            log_dir=log_dir,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    #algorithm.train()
    samples = algorithm.get_eval_paths()
    #for path in samples:
    #    print(path['observations'])

    #plt.figure()
    #plt.plot(samples[0]['observations'][:, 0], samples[0]['observations'][:, 1])
    #plt.plot(3, 2)
    #plt.show()
    print(env.reset())
    print(samples[0]['observations'])
    for i, path in enumerate(samples):
        np.save('./outtem/out%i.npy' % i, path['observations'])
    #print(algorithm.policy.get_action(np.array([0,0])))
    from rlkit.samplers.util import rollout
    from rlkit.samplers.in_place import InPlacePathSampler
    #path=rollout(env,algorithm.eval_policy,50)
    # Note: the sampler constructed here is not used below; the call goes
    # through the algorithm's own eval_sampler instead.
    eval_sampler = InPlacePathSampler(
        env=env,
        policy=algorithm.eval_policy,
        max_samples=100,
        max_path_length=50,
    )
    path = algorithm.eval_sampler.obtain_samples()
    print(path[0]['observations'])
Example 21
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        # Explicit Loader keeps this working on PyYAML >= 5.1.
        listings = yaml.load(f.read(), Loader=yaml.FullLoader)
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-{}-{}-train--sigma-{}'.format(
            variant['env_specs']['env_name'],
            variant['expert_traj_num'],
            variant['ebm_sigma'],
        )
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))

        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    elif variant['ebil_params']['mode'] == 'ae':
        ebm_exp_name = 'ebm-ae-{}-{}-train--sigma-{}'.format(
            variant['env_specs']['env_name'],
            variant['expert_traj_num'],
            variant['ebm_sigma'],
        )
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))

        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    else:
        raise NotImplementedError

    print("loaded EBM from {}".format(load_ebm_path))

    # Test
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(
            100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm_pretrain = BC(env=env,
                            training_env=training_env,
                            exploration_policy=policy,
                            expert_replay_buffer=expert_replay_buffer,
                            **variant['bc_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     expert_replay_buffer=expert_replay_buffer,
                     **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm_pretrain.to(ptu.device)
        algorithm.to(ptu.device)
    else:
        algorithm_pretrain.to('cpu')
        algorithm.to('cpu')
    if variant['pretrain']:
        algorithm_pretrain.train()

    algorithm.train()

    return 1
Example 22
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1
    
    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    net_size = variant['net_size']
    num_hidden = variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
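A sketch of a variant matching this trainer/algorithm split; the keys are the ones the function reads (env_specs, net_size, num_hidden_layers, sac_params, rl_alg_params), while the values, including the environment name, are placeholder assumptions.

# Placeholder variant for the experiment() above; values are assumptions.
variant = dict(
    env_specs=dict(
        env_name='halfcheetah',   # assumed to be a name understood by get_env()
        env_kwargs=dict(),
        eval_env_seed=0,
        training_env_seed=1,
    ),
    net_size=256,
    num_hidden_layers=2,
    sac_params=dict(),            # forwarded to SoftActorCritic
    rl_alg_params=dict(),         # forwarded to TorchRLAlgorithm
)
experiment(variant)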
Example 23
def experiment(variant):
    intrinsic_reward = variant['intrinsic_reward']

    # Create environment.
    num_skills = (variant['smm_kwargs']['num_skills']
                  if variant['intrinsic_reward'] == 'smm' else 0)
    env, training_env = create_env(variant['env_id'], variant['env_kwargs'],
                                   num_skills)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    # Initialize networks.
    net_size = variant['net_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        hidden_sizes=[net_size, net_size],
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        hidden_sizes=[net_size, net_size],
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'])

    if intrinsic_reward == 'smm':
        discriminator = FlattenMlp(
            input_size=obs_dim - num_skills,
            hidden_sizes=[net_size, net_size],
            output_size=num_skills,
        )
        density_model = VAEDensity(input_size=obs_dim,
                                   num_skills=num_skills,
                                   code_dim=128,
                                   **variant['vae_density_kwargs'])

        # Overwrite appropriate functions of algorithm.
        smm_algorithm_hook = SMMHook(base_algorithm=algorithm,
                                     discriminator=discriminator,
                                     density_model=density_model,
                                     **variant['smm_kwargs'])
    elif intrinsic_reward == 'icm':
        embedding_model = FlattenMlp(
            input_size=obs_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        forward_model = FlattenMlp(
            input_size=net_size + action_dim,
            hidden_sizes=[net_size, net_size],
            output_size=net_size,
        )
        inverse_model = FlattenMlp(
            input_size=net_size + net_size,
            hidden_sizes=[],
            output_size=action_dim,
        )

        # Overwrite appropriate functions of algorithm.
        ICMHook(base_algorithm=algorithm,
                embedding_model=embedding_model,
                forward_model=forward_model,
                inverse_model=inverse_model,
                **variant['icm_kwargs'])
    elif intrinsic_reward == 'count':
        count_algorithm_hook = CountHook(base_algorithm=algorithm,
                                         **variant['count_kwargs'])
    elif intrinsic_reward == 'pseudocount':
        density_model = VAEDensity(
            input_size=obs_dim,
            num_skills=0,
            code_dim=128,
            **variant['vae_density_kwargs'],
        )

        # Overwrite appropriate functions of algorithm.
        PseudocountHook(base_algorithm=algorithm,
                        density_model=density_model,
                        **variant['pseudocount_kwargs'])

    algorithm.to(ptu.device)
    algorithm.train()
Example 24
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        # Explicit Loader keeps this working on PyYAML >= 5.1.
        listings = yaml.load(f.read(), Loader=yaml.FullLoader)
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    print(demos_path)
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-{}-{}-train--sigma-{}'.format(
            variant['env_specs']['env_name'],
            variant['expert_traj_num'],
            variant['ebm_sigma'],
        )
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        load_ebm_dir = ebm_dir
        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    else:
        raise NotImplementedError

    # Test
    if variant['test']:
        batch_data = target_state_buffer / variant['rescale']
        obs = torch.Tensor(batch_data[:1000]).to(ptu.device)
        print("Not expert data", ebm_model(obs * 200).mean().item())
        print("Expert data", ebm_model(obs).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     rescale=variant['rescale'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     target_state_buffer=target_state_buffer,
                     state_indices=state_indices,
                     **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
Example 25
def experiment(args):
    if not args.cpu:
        ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)

    variant_overwrite = dict(
        # Evaluate model on num_episodes.
        algo_kwargs=dict(
            reward_scale=args.reward_scale,
            collection_mode='episodic',
            num_episodes=args.num_episodes,
            max_path_length=args.max_path_length,
            render=args.render,

            # Evaluate without additional training
            num_updates_per_episode=0,
            min_num_steps_before_training=(
                args.max_path_length * args.num_episodes + 1),
        ),

        # Environment settings
        env_kwargs=dict(
            sample_goal=False,
            goal_prior=args.test_goal,
            shaped_rewards=[
                'object_off_table', 'object_goal_indicator',
                'object_gripper_indicator', 'action_penalty'
            ],
            terminate_upon_success=False,
            terminate_upon_failure=False,
        ),

        # SMM settings
        smm_kwargs=dict(
            # Posterior adaptation of latent skills p(z)
            update_p_z_prior_coeff=args.update_p_z_prior_coeff,

            # Turn off SMM reward.
            state_entropy_coeff=0,
            latent_entropy_coeff=0,
            latent_conditional_entropy_coeff=0,
            discriminator_lr=0,
        ),
    )

    # Load experiment from file.
    env, _, data, variant = load_experiment(args.logdir, variant_overwrite)
    assert all([a == b for a, b in zip(env.sampled_goal, args.test_goal)])
    variant.update(test_goal=list(env.sampled_goal))
    if args.num_historical_policies > 0:
        variant.update(historical_policies_kwargs=dict(
            log_dir=args.logdir,
            num_historical_policies=args.num_historical_policies,
            sample_strategy=args.sample_strategy,
            on_policy_prob=args.on_policy_prob,
        ))

    # Set log directory.
    exp_id = 'eval/ne{}-mpl{}-{}-rs{}/nhp{}-{}-opp{}'.format(
        args.num_episodes,
        args.max_path_length,
        ','.join(variant_overwrite['env_kwargs']['shaped_rewards']),
        args.reward_scale,
        args.num_historical_policies,
        args.sample_strategy,
        args.on_policy_prob,
    )
    exp_id = create_exp_name(exp_id)
    log_dir = os.path.join(args.logdir, exp_id)
    print('Logging to:', log_dir)
    setup_logger(
        log_dir=log_dir,
        variant=variant,
        snapshot_mode='none',
        snapshot_gap=50,
    )

    # Load trained model from file.
    policy = data['policy']
    vf = data['vf']
    qf = data['qf']
    algorithm = SoftActorCritic(
        env=env,
        training_env=env,  # can't clone box2d env cause of swig
        save_environment=False,  # can't save box2d env cause of swig
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs'],
    )

    # Overwrite algorithm for p(z) adaptation (if model is SMM).
    if 'smm_kwargs' in variant:
        discriminator = data['discriminator']
        density_model = data['density_model']
        SMMHook(base_algorithm=algorithm,
                discriminator=discriminator,
                density_model=density_model,
                **variant['smm_kwargs'])

    # Overwrite algorithm for historical averaging.
    if args.num_historical_policies > 0:
        HistoricalPoliciesHook(
            base_algorithm=algorithm,
            **variant['historical_policies_kwargs'],
        )

    algorithm.to(ptu.device)
    algorithm.train()