Example no. 1
    def _do_reward_training(self):
        '''
            Train the discriminator
        '''
        self.disc_optimizer.zero_grad()

        expert_batch = self.get_expert_batch(self.disc_optim_batch_size)
        expert_obs = expert_batch['observations']
        expert_actions = expert_batch['actions']

        policy_batch = self.get_policy_batch(self.disc_optim_batch_size)
        policy_obs = policy_batch['observations']
        policy_actions = policy_batch['actions']

        obs = torch.cat([expert_obs, policy_obs], dim=0)
        actions = torch.cat([expert_actions, policy_actions], dim=0)

        disc_logits = self.discriminator(obs, actions)
        disc_preds = (disc_logits > 0).float()
        disc_loss = self.bce(disc_logits, self.bce_targets)
        accuracy = (disc_preds == self.bce_targets).float().mean()

        if self.use_grad_pen:
            eps = Variable(torch.rand(self.disc_optim_batch_size, 1))
            if ptu.gpu_enabled(): eps = eps.cuda()

            interp_obs = (eps * expert_obs + (1 - eps) * policy_obs).detach()
            interp_obs.requires_grad = True
            interp_actions = (eps * expert_actions + (1 - eps) * policy_actions).detach()
            interp_actions.requires_grad = True
            gradients = autograd.grad(
                outputs=self.discriminator(interp_obs, interp_actions).sum(),
                inputs=[interp_obs, interp_actions],
                # grad_outputs=torch.ones(exp_specs['batch_size'], 1).cuda(),
                create_graph=True,
                retain_graph=True,
                only_inputs=True)
            total_grad = torch.cat([gradients[0], gradients[1]], dim=1)
            gradient_penalty = ((total_grad.norm(2, dim=1) - 1)**2).mean()

            disc_loss = disc_loss + gradient_penalty * self.grad_pen_weight

        disc_loss.backward()
        self.disc_optimizer.step()
        """
        Save some statistics for eval
        """
        if self.rewardf_eval_statistics is None:
            """
            Eval should set this to None.
            This way, these statistics are only computed for one batch.
            """
            self.rewardf_eval_statistics = OrderedDict()
            self.rewardf_eval_statistics['Disc Loss'] = np.mean(
                ptu.get_numpy(disc_loss))
            self.rewardf_eval_statistics['Disc Acc'] = np.mean(
                ptu.get_numpy(accuracy))
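For orientation, a minimal sketch of the outer driver such a step is usually wrapped in, assuming loop-count attributes like the num_update_loops_per_train_call and num_disc_updates_per_loop_iter set in the __init__ of Example no. 8; the method name _do_training and the call structure are assumptions, not code from the original project.

    def _do_training(self):
        # Hypothetical driver: repeat the discriminator update according to the
        # assumed loop-count hyperparameters.
        for _ in range(self.num_update_loops_per_train_call):
            for _ in range(self.num_disc_updates_per_loop_iter):
                self._do_reward_training()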
Example no. 2
def experiment(variant):
    env_sampler = MazeSampler(variant['env_specs'])
    env, _ = env_sampler()

    if variant['conv_input']:
        qf = ConvNet(kernel_sizes=variant['kernel_sizes'],
                     num_channels=variant['num_channels'],
                     strides=variant['strides'],
                     paddings=variant['paddings'],
                     hidden_sizes=variant['hidden_sizes'],
                     input_size=env.observation_space.shape,
                     output_size=env.action_space.n)
    else:
        qf = Mlp(
            hidden_sizes=[
                variant['net_size'] for _ in range(variant['num_layers'])
            ],
            input_size=int(np.prod(env.observation_space.shape)),
            output_size=env.action_space.n,
        )
    qf_criterion = nn.MSELoss()
    # Use this to switch to DoubleDQN
    # algorithm = DoubleDQN(
    print(env_sampler)
    algorithm = MetaDQN(env_sampler=env_sampler,
                        qf=qf,
                        qf_criterion=qf_criterion,
                        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
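A sketch of the kind of variant dict this experiment expects, using only the keys the function reads above; every value is a placeholder rather than a setting from the original project.

variant = dict(
    env_specs=dict(),    # passed to MazeSampler; contents depend on the maze setup
    conv_input=False,    # take the Mlp branch
    net_size=64,         # hypothetical hidden width
    num_layers=2,        # hypothetical number of hidden layers
    algo_params=dict(),  # forwarded unchanged to MetaDQN
)
experiment(variant)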
Example no. 3
def experiment(variant):
    expert_buffer = joblib.load(variant['exp_xy_data_path'])['xy_data']
    policy_buffer = joblib.load(variant['pol_xy_data_path'])['xy_data']

    # set up the discriminator models
    if variant['threeway']:
        disc_model_class = ThreeWayResNetAIRLDisc
    else:
        if variant['use_resnet_disc']:
            disc_model_class = ResNetAIRLDisc
        else:
            disc_model_class = StandardAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AIRL algorithm
    alg_class = ThreeWayFixedDistDiscTrainAlg if variant['threeway'] else FixedDistDiscTrainAlg
    algorithm = alg_class(disc_model, expert_buffer, policy_buffer,
                          **variant['algo_params'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 4
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    env = NormalizedBoxEnv(gym.make('Pointmass-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 5
    def __init__(
        self,
        train_dataset,
        test_dataset,
        model,
        batch_size=128,
        log_interval=0,
        beta=0.5,
        beta_schedule=None,
        imsize=84,
        lr=1e-3,
        do_scatterplot=False,
        normalize=False,
        state_sim_debug=False,
        mse_weight=0.1,
        is_auto_encoder=False,
        lmbda=0.5,
        mu=1,
        gamma=0.2,
    ):
        self.log_interval = log_interval
        self.batch_size = batch_size
        self.beta = beta
        if is_auto_encoder:
            self.beta = 0
        self.beta_schedule = beta_schedule
        if self.beta_schedule is None:
            self.beta_schedule = ConstantSchedule(self.beta)
        self.imsize = imsize
        self.do_scatterplot = do_scatterplot
        self.lmbda = lmbda
        self.mu = mu
        self.gamma = gamma
        """
        I think it's a bit nicer if the caller makes this call, i.e.
        ```
        m = ConvVAE(representation_size)
        if ptu.gpu_enabled():
            m.cuda()
        t = ConvVAETrainer(train_data, test_data, m)
        ```
        However, I'll leave this here for backwards-compatibility.
        """
        if ptu.gpu_enabled():
            model.cuda()

        self.model = model
        self.representation_size = model.representation_size
        self.input_channels = model.input_channels
        self.imlength = model.imlength

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.train_dataset, self.test_dataset = train_dataset, test_dataset
        self.normalize = normalize
        self.state_sim_debug = state_sim_debug
        self.mse_weight = mse_weight
        self.x_next_index = self.input_channels * self.imsize**2

        if self.normalize:
            self.train_data_mean = np.mean(self.train_dataset, axis=0)
Example no. 6
def experiment(variant):
    farmlist_base = [('123.123.123.123', 4)]

    farmer = Farmer(farmlist_base)
    environment = acq_remote_env(farmer)
    env = NormalizedBoxEnv(environment)

    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 7
def experiment(user_variant):
    variant = default_variant.copy()
    variant.update(user_variant)

    if ptu.gpu_enabled():
        enable_gpus("0")

    env_id = variant["env"]
    env = build_env(env_id)

    agent_configs = variant["agent_configs"]
    agent = build_agent(env, env_id, agent_configs)
    agent.visualize = variant["visualize"]
    model_file = variant.get("model_file")
    if model_file:
        agent.load_model(model_file)

    log_dir = logger.get_snapshot_dir()
    if variant["train"]:
        agent.train(max_iter=variant["max_iter"],
                    test_episodes=variant["test_episodes"],
                    output_dir=log_dir,
                    output_iters=variant["output_iters"])
    else:
        agent.eval(num_episodes=variant["test_episodes"])

    return
Example no. 8
    def __init__(
        self,
        discriminator,
        exp_data,
        pol_data,
        disc_optim_batch_size=1024,
        num_update_loops_per_train_call=1,
        num_disc_updates_per_loop_iter=1,
        disc_lr=1e-3,
        disc_momentum=0.0,
        disc_optimizer_class=optim.Adam,
        use_grad_pen=True,
        grad_pen_weight=10,
        train_objective='airl',
    ):
        assert disc_lr != 1e-3, 'Just checking that this is being taken from the spec file'

        self.exp_data, self.pol_data = exp_data, pol_data

        self.discriminator = discriminator
        self.rewardf_eval_statistics = None
        self.disc_optimizer = disc_optimizer_class(
            self.discriminator.parameters(),
            lr=disc_lr,
            betas=(disc_momentum, 0.999))
        print('\n\nDISC MOMENTUM: %f\n\n' % disc_momentum)

        self.disc_optim_batch_size = disc_optim_batch_size

        assert train_objective in ['airl', 'fairl', 'gail', 'w1']
        self.train_objective = train_objective

        self.bce = nn.BCEWithLogitsLoss()
        target_batch_size = self.disc_optim_batch_size
        self.bce_targets = torch.cat(
            [
                torch.ones(target_batch_size, 1),
                torch.zeros(target_batch_size, 1),
            ],
            dim=0,
        )
        self.bce_targets = Variable(self.bce_targets)
        if ptu.gpu_enabled():
            self.bce.cuda()
            self.bce_targets = self.bce_targets.cuda()

        self.use_grad_pen = use_grad_pen
        self.grad_pen_weight = grad_pen_weight

        self.num_update_loops_per_train_call = num_update_loops_per_train_call
        self.num_disc_updates_per_loop_iter = num_disc_updates_per_loop_iter

        d = 5.0
        self._d = d
        self._d_len = np.arange(-d, d + 0.25, 0.25).shape[0]
        self.xy_var = []
        for i in np.arange(-d, d + 0.25, 0.25):
            for j in np.arange(-d, d + 0.25, 0.25):
                self.xy_var.append([float(i), float(j)])
        self.xy_var = np.array(self.xy_var)
        self.xy_var = Variable(ptu.from_numpy(self.xy_var),
                               requires_grad=False)
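This __init__ matches the positional arguments passed to FixedDistDiscTrainAlg in Example no. 3, so a plausible construction is sketched below; the keyword values are placeholders standing in for a spec file (note the assert above rejects the default disc_lr of 1e-3).

alg = FixedDistDiscTrainAlg(
    disc_model,       # the discriminator network
    expert_buffer,    # exp_data
    policy_buffer,    # pol_data
    disc_lr=3e-4,     # must differ from 1e-3, per the assert
    disc_momentum=0.5,
    use_grad_pen=True,
    grad_pen_weight=10,
    train_objective='airl',
)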
Example no. 9
def experiment(variant):
    # make the disc model
    z_dim = variant['algo_params']['z_dim']

    # make the MLP
    # hidden_sizes = [variant['algo_params']['mlp_hid_dim']] * variant['algo_params']['mlp_layers']
    # mlp = Mlp(
    #     hidden_sizes,
    #     output_size=1,
    #     input_size=48 + z_dim,
    #     batch_norm=variant['algo_params']['mlp_use_bn']
    # )

    algorithm = FetchShapeTaskDesign(
        # mlp,
        **variant['algo_params']
    )

    # for _ in range(100):
    #     # print(algorithm._get_any())
    #     # print(algorithm._get_except(0,1))

    #     img = algorithm._get_image_without_object(1, 2)
    #     print('-------')
    #     print(img[:6])
    #     print(img[6:12])
    #     print(img[12:18])
    #     print(img[18:24])
    # 1/0

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 10
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_sampler = MazeSampler(env_specs)
    sample_env, _ = env_sampler()
    meta_params_dim = 0

    obs_dim = int(np.prod(sample_env.observation_space.shape))
    if isinstance(sample_env.action_space, Discrete):
        action_dim = int(sample_env.action_space.n)
    else:
        action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=action_dim,
    )
    policy = DiscreteQWrapperPolicy(qf)

    algorithm = MetaSoftQLearning(env_sampler=env_sampler,
                                  qf=qf,
                                  policy=policy,
                                  **variant['algo_params'])
    # assert False, "Have not added new sac yet!"
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 11
def experiment(variant):
    from rlkit.core import logger
    import rlkit.torch.pytorch_util as ptu
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data, info = generate_vae_dataset(
        **variant['get_data_kwargs'])
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    if 'beta_schedule_kwargs' in variant:
        beta_schedule = PiecewiseLinearSchedule(
            **variant['beta_schedule_kwargs'])
    else:
        beta_schedule = None
    m = ConvVAE(representation_size,
                input_channels=3,
                **variant['conv_vae_kwargs'])
    if ptu.gpu_enabled():
        m.to(ptu.device)
    t = ConvVAETrainer(train_data,
                       test_data,
                       m,
                       beta=beta,
                       beta_schedule=beta_schedule,
                       **variant['algo_kwargs'])
    save_period = variant['save_period']
    for epoch in range(variant['num_epochs']):
        should_save_imgs = (epoch % save_period == 0)
        t.train_epoch(epoch)
        t.test_epoch(epoch,
                     save_reconstruction=should_save_imgs,
                     save_scatterplot=should_save_imgs)
        if should_save_imgs:
            t.dump_samples(epoch)
Example no. 12
    def forward(self, obs_batch, act_batch, prev_h_batch, prev_c_batch):
        lstm_act_proc = self.lstm_act_proc_fc(act_batch)
        recon_act_proc = self.recon_act_proc_fc(act_batch)
        
        batch_size = obs_batch.size(0)
        att_prev_h_batch = Variable(torch.zeros(batch_size, self.flat_inter_img_dim))
        att_prev_c_batch = Variable(torch.zeros(batch_size, self.flat_inter_img_dim))
        if ptu.gpu_enabled():
            att_prev_h_batch = att_prev_h_batch.cuda()
            att_prev_c_batch = att_prev_c_batch.cuda()
        
        self.reg_loss = 0.
        
        att_input = torch.cat([prev_h_batch, recon_act_proc], 1)
        for _ in range(4):
            att_prev_h_batch, att_prev_c_batch = self.attention_lstm(att_input, (att_prev_h_batch, att_prev_c_batch))

        hidden = att_prev_h_batch.view(obs_batch.size(0), self.conv_channels, self.img_h, self.img_h)
        hidden = self.conv_decoder(hidden)
        recon = self.mean_decoder(hidden)
        log_cov = self.log_cov_decoder(hidden)
        log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

        enc = self.conv_encoder(obs_batch)
        enc = enc.view(obs_batch.size(0), -1)
        enc = self.fc_encoder(torch.cat([enc, lstm_act_proc], 1))
        prev_h_batch, prev_c_batch = self.lstm(enc, (prev_h_batch, prev_c_batch))

        return recon, log_cov, prev_h_batch, prev_c_batch
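A sketch of how this forward pass might be unrolled over a short sequence; model, obs_seq, act_seq, and batch_size stand in for an instance of this module and pre-batched tensors, and lstm_hidden_dim is an assumed attribute holding the LSTM state size.

prev_h = torch.zeros(batch_size, model.lstm_hidden_dim)
prev_c = torch.zeros(batch_size, model.lstm_hidden_dim)
if ptu.gpu_enabled():
    prev_h, prev_c = prev_h.cuda(), prev_c.cuda()
reconstructions = []
for obs_t, act_t in zip(obs_seq, act_seq):
    # Each step reconstructs the current frame and carries the LSTM state forward.
    recon, log_cov, prev_h, prev_c = model(obs_t, act_t, prev_h, prev_c)
    reconstructions.append(recon)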
Example no. 13
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 14
def experiment(variant):
    env = NormalizedBoxEnv(PointEnv(**variant['task_params']))
    ptu.set_gpu_mode(variant['use_gpu'], variant['gpu_id'])

    tasks = env.get_all_task_idx()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = 5
    task_enc_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    reward_dim = 1

    net_size = variant['net_size']
    # start with linear task encoding
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder
    task_enc = encoder_model(
            hidden_sizes=[200, 200, 200], # deeper net + higher dim space generalize better
            input_size=obs_dim + action_dim + reward_dim,
            output_size=task_enc_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = ProtoAgent(
        latent_dim,
        [task_enc, policy, qf1, qf2, vf],
        **variant['algo_params']
    )

    algorithm = ProtoSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:-20]),
        eval_tasks=list(tasks[-20:]),
        nets=[agent, task_enc, policy, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()
Example no. 15
def experiment(variant):
    algorithm = joblib.load(variant['ckpt_path'])['algorithm']
    if ptu.gpu_enabled():
        algorithm.cuda()

    tuner = FetchTuner(algorithm, **variant['algo_params'])
    tuner.train()
    return 1
Example no. 16
def experiment(variant):
    task_mode = variant['task_mode'] # train, test, eval
    task_idx = variant['task_idx']

    if task_mode == 'train':
        task_sampler = WalkerTrainParamsSampler()
    elif task_mode == 'test':
        task_sampler = WalkerTestParamsSampler()
    else:
        raise NotImplementedError()
    task_params = task_sampler.get_task(task_idx)
    obs_task_params = task_sampler.get_obs_task_params(task_params)
    env = SingleTaskWalkerEnv(task_params, obs_task_params)
    training_env = SingleTaskWalkerEnv(task_params, obs_task_params)

    print(env.observation_space)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    print('Using simple model')
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']
    )
    
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 17
def experiment(variant):
    # we have to generate the combinations for the env_specs
    env_specs = variant['env_specs']
    env_specs_vg = VariantGenerator()
    env_spec_constants = {}
    for k, v in env_specs.items():
        if isinstance(v, list):
            env_specs_vg.add(k, v)
        else:
            env_spec_constants[k] = v

    env_specs_list = []
    for es in env_specs_vg.variants():
        del es['_hidden_keys']
        es.update(env_spec_constants)
        env_specs_list.append(es)
    print(env_specs_list)

    print(env_specs_list[0])
    env_sampler = EnvSampler(env_specs_list)

    # set up similar to non-meta version
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + meta_params_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + meta_params_dim,
        action_dim=action_dim,
    )
    algorithm = MetaSoftActorCritic(env_sampler=env_sampler,
                                    policy=policy,
                                    qf=qf,
                                    vf=vf,
                                    **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 18
    def preload(self, batch_size):
        try:
            self.batch = self.random_batch(batch_size)
        except StopIteration:
            self.batch = None
            return
        if ptu.gpu_enabled():
            # with torch.cuda.stream(self.stream):
            for k in self.batch:
                self.batch[k] = self.batch[k].to(device=ptu.device,
                                                 non_blocking=True)
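A hypothetical companion method illustrating the prefetch pattern preload enables: hand back the batch that is already on the device, then immediately start loading the next one. The method name and placement are assumptions.

    def next_batch(self, batch_size):
        # Return the batch prepared by the previous preload() call and kick off
        # the transfer of the next one (double-buffering sketch).
        batch = self.batch
        self.preload(batch_size)
        return batch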
Example no. 19
def experiment(specs):
    with open(path.join(specs['specific_exp_dir'], 'variant.json'), 'r') as f:
        variant = json.load(f)
    variant['algo_params']['do_not_train'] = True
    variant['seed'] = specs['seed']
    policy = joblib.load(path.join(specs['specific_exp_dir'],
                                   'params.pkl'))['exploration_policy']

    assert False, 'Do you really wanna make it deterministic?'
    policy = MakeDeterministic(policy)

    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    variant['algo_params']['replay_buffer_size'] = int(
        np.floor(specs['num_episodes'] *
                 variant['algo_params']['max_path_length'] /
                 specs['subsampling']))
    # Hack until I figure out how things are gonna be in general then I'll clean it up
    if 'policy_uses_pixels' not in variant['algo_params']:
        variant['algo_params']['policy_uses_pixels'] = False
    if 'policy_uses_task_params' not in variant['algo_params']:
        variant['algo_params']['policy_uses_task_params'] = False
    if 'concat_task_params_to_policy_obs' not in variant['algo_params']:
        variant['algo_params']['concat_task_params_to_policy_obs'] = False
    replay_buffer = ExpertReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        subsampling=specs['subsampling'],
        policy_uses_pixels=variant['algo_params']['policy_uses_pixels'],
        policy_uses_task_params=variant['algo_params']['policy_uses_task_params'],
        concat_task_params_to_policy_obs=variant['algo_params']['concat_task_params_to_policy_obs'],
    )
    variant['algo_params']['freq_saving'] = 1

    algorithm = ExpertTrajGeneratorAlgorithm(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        replay_buffer=replay_buffer,
        max_num_episodes=specs['num_episodes'],
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 20
    def __init__(self,
                 env,
                 policy,
                 discriminator,
                 policy_optimizer,
                 expert_replay_buffer,
                 disc_optim_batch_size=32,
                 policy_optim_batch_size=1000,
                 disc_lr=1e-3,
                 disc_optimizer_class=optim.Adam,
                 use_grad_pen=True,
                 grad_pen_weight=10,
                 plotter=None,
                 render_eval_paths=False,
                 eval_deterministic=True,
                 **kwargs):
        assert disc_lr != 1e-3, 'Just checking that this is being taken from the spec file'
        if eval_deterministic:
            eval_policy = MakeDeterministic(policy)
        else:
            eval_policy = policy
        super().__init__(env=env,
                         exploration_policy=policy,
                         eval_policy=eval_policy,
                         expert_replay_buffer=expert_replay_buffer,
                         policy_optimizer=policy_optimizer,
                         **kwargs)

        self.discriminator = discriminator
        self.rewardf_eval_statistics = None
        self.disc_optimizer = disc_optimizer_class(
            self.discriminator.parameters(),
            lr=disc_lr,
        )

        self.disc_optim_batch_size = disc_optim_batch_size
        self.policy_optim_batch_size = policy_optim_batch_size

        self.bce = nn.BCEWithLogitsLoss()
        self.bce_targets = torch.cat(
            [
                torch.ones(self.disc_optim_batch_size, 1),
                torch.zeros(self.disc_optim_batch_size, 1),
            ],
            dim=0,
        )
        self.bce_targets = Variable(self.bce_targets)
        if ptu.gpu_enabled():
            self.bce.cuda()
            self.bce_targets = self.bce_targets.cuda()

        self.use_grad_pen = use_grad_pen
        self.grad_pen_weight = grad_pen_weight
Example no. 21
def continue_experiment(args):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    extra = joblib.load(args.extra)

    algorithm = extra['algorithm']
    algorithm.farmlist_base = [('0.0.0.0', 15)]
    algorithm.refarm()

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 22
def experiment(variant):
    env = NormalizedBoxEnv(
        GoalXYPosAndVelAnt(
            goal_dim_weights=[0.1, 0.1, 0.9, 0.9],
            speed_weight=None,
        ))
    max_tau = variant['tdm_kwargs']['max_tau']
    # Normalizer isn't used unless you set num_pretrain_paths > 0
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=1,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        max_size=int(1E6),
    )
    algorithm = TemporalDifferenceModel(env,
                                        qf=qf,
                                        replay_buffer=replay_buffer,
                                        policy=policy,
                                        exploration_policy=exploration_policy,
                                        qf_criterion=HuberLoss(),
                                        tdm_normalizer=tdm_normalizer,
                                        **variant['tdm_kwargs'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 23
def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    # ---------
    # env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))

    env = ReacherEnv()
    training_env = ReacherEnv()
    
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    total_meta_variable_dim = 0
    for dims in exp_specs['true_meta_variable_dims']:
        total_meta_variable_dim += sum(dims)

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim + total_meta_variable_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + total_meta_variable_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + total_meta_variable_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
Example no. 24
def experiment(variant):
    logger.add_text_output('./d_text.txt')
    logger.add_tabular_output('./d_tabular.txt')
    logger.set_snapshot_dir('./snaps')

    farmer = Farmer([('0.0.0.0', 1)])
    remote_env = farmer.force_acq_env()
    remote_env.set_spaces()
    env = NormalizedBoxEnv(remote_env)

    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[256, 256],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[256, 256],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 25
def experiment(variant):
    env = NormalizedBoxEnv(GoalXVelHalfCheetah())
    max_tau = variant['tdm_kwargs']['max_tau']
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=max_tau,
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        norm_order=1,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        hidden_sizes=[300, 300],
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        max_size=int(1E6),
    )
    algorithm = TemporalDifferenceModel(env,
                                        qf=qf,
                                        replay_buffer=replay_buffer,
                                        policy=policy,
                                        exploration_policy=exploration_policy,
                                        tdm_normalizer=tdm_normalizer,
                                        qf_criterion=HuberLoss(),
                                        **variant['tdm_kwargs'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 26
    def __init__(self, max_replay_buffer_size, env, env_info_sizes=None):
        observation_dim = get_dim(env.observation_space)
        action_dim = get_dim(env.action_space)

        if env_info_sizes is None:
            if hasattr(env, 'info_sizes'):
                env_info_sizes = env.info_sizes
            else:
                env_info_sizes = dict()

        self._max_replay_buffer_size = max_replay_buffer_size
        self._observations = torch.zeros(
            (max_replay_buffer_size, observation_dim),
            dtype=torch.float).pin_memory()
        # It's a bit memory inefficient to save the observations twice,
        # but it makes the code *much* easier since you no longer have to
        # worry about termination conditions.
        self._next_obs = torch.zeros((max_replay_buffer_size, observation_dim),
                                     dtype=torch.float).pin_memory()
        self._actions = torch.zeros((max_replay_buffer_size, action_dim),
                                    dtype=torch.float).pin_memory()
        # Make everything a 2D tensor to make it easier for other code to
        # reason about the shape of the data
        self._rewards = torch.zeros((max_replay_buffer_size, 1),
                                    dtype=torch.float).pin_memory()
        # self._terminals[i] = a terminal was received at time i
        self._terminals = torch.zeros((max_replay_buffer_size, 1),
                                      dtype=torch.float).pin_memory()
        # Define self._env_infos[key][i] to be the return value of env_info[key]
        # at time i
        self._env_infos = {}
        for key, size in env_info_sizes.items():
            self._env_infos[key] = torch.zeros((max_replay_buffer_size, size),
                                               dtype=torch.float).pin_memory()
        self._env_info_keys = env_info_sizes.keys()

        self._top = 0
        self._size = 0

        if ptu.gpu_enabled():
            # self.stream = torch.cuda.Stream(ptu.device)
            self.batch = None
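A sketch of the ring-buffer insertion that the _top/_size counters above imply; the method name and signature follow the usual rlkit-style replay buffer, but here they are assumptions.

    def add_sample(self, observation, action, reward, terminal, next_observation):
        # Write one transition at the current head, then advance it with wrap-around.
        self._observations[self._top] = torch.as_tensor(observation)
        self._actions[self._top] = torch.as_tensor(action)
        self._rewards[self._top] = float(reward)
        self._terminals[self._top] = float(terminal)
        self._next_obs[self._top] = torch.as_tensor(next_observation)
        self._top = (self._top + 1) % self._max_replay_buffer_size
        self._size = min(self._size + 1, self._max_replay_buffer_size)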
Example no. 27
    def __init__(
        self,
        X_train,
        X_test,
        y_train,
        y_test,
        model,
        batch_size=128,
        lr=3e-4,
        weight_decay=0,
        num_batches=128,
    ):
        self.batch_size = batch_size
        if ptu.gpu_enabled():
            model.to(ptu.device)
        self.model = model
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test
        self.num_batches = num_batches
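A sketch of the training step these fields support; the method name, the random mini-batching, and the use of ptu.from_numpy are assumptions rather than the original trainer's code.

    def train_epoch(self):
        # One pass over num_batches randomly sampled mini-batches (sketch).
        for _ in range(self.num_batches):
            idx = np.random.randint(0, len(self.X_train), self.batch_size)
            x = ptu.from_numpy(self.X_train[idx])
            y = ptu.from_numpy(self.y_train[idx])
            self.optimizer.zero_grad()
            loss = self.criterion(self.model(x), y)
            loss.backward()
            self.optimizer.step()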
Example no. 28
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))

    num_skills = variant['num_skills']
    # observation dim includes dim of latent variable
    obs_dim = int(np.prod(env.observation_space.shape)) + num_skills
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )

    # TODO: VERIFY THIS
    # num_skills=variant['num_skills']
    discrim = FlattenMlp(hidden_sizes=[net_size, net_size],
                         input_size=obs_dim - num_skills,
                         output_size=num_skills,
                         output_activation=nn.Sigmoid())

    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = DIAYN(env=env,
                      policy=policy,
                      qf=qf,
                      vf=vf,
                      discrim=discrim,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example no. 29
def her_twin_sac_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(env=env,
                                            observation_key=observation_key,
                                            desired_goal_key=desired_goal_key,
                                            **variant['replay_buffer_kwargs'])
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    qf1 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    qf2 = ConcatMlp(input_size=obs_dim + action_dim + goal_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    vf = ConcatMlp(input_size=obs_dim + goal_dim,
                   output_size=1,
                   **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **variant['policy_kwargs'])
    algorithm = HerTwinSac(env,
                           qf1=qf1,
                           qf2=qf2,
                           vf=vf,
                           policy=policy,
                           replay_buffer=replay_buffer,
                           observation_key=observation_key,
                           desired_goal_key=desired_goal_key,
                           **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
Example no. 30
def experiment(variant,env=None):
    if env is None:
        # default setting of environment
        env = NormalizedBoxEnv(HopperEnv())
    es = GaussianStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf1,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
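Note that qf2 above is never consumed: DDPG only takes the single critic qf1. If both critics are wanted, the TD3 construction from Example no. 24 is the one that uses them; a sketch under that assumption, reusing the networks defined above:

algorithm = TD3(
    env,
    qf1=qf1,
    qf2=qf2,
    policy=policy,
    exploration_policy=exploration_policy,
    **variant['algo_params']
)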