コード例 #1
0
ファイル: mem_ddpg_exp.py プロジェクト: jcoreyes/erl
def run_linear_ocm_exp(variant):
    """Train DDPG on the memory-augmented one-char-memory task.

    :param variant: experiment config dict with keys 'H', 'seed',
        'num_values', 'algo_params', and optionally 'memory_dim'
        (defaults to num_values + 1, the memory size of the augmented env).
    """
    from rlkit.tf.ddpg import DDPG
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    algo_params = variant['algo_params']

    set_seed(seed)
    onehot_dim = num_values + 1
    # BUG FIX: `memory_dim` was used below without ever being defined
    # (NameError at runtime).  The augmented env is built with onehot_dim
    # memory states, so the policy's memory size defaults to that; an
    # explicit 'memory_dim' entry in the variant overrides it.
    memory_dim = variant.get('memory_dim', onehot_dim)

    env_action_dim = num_values + 1

    """
    Code for running the experiment.
    """

    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    # env = FlattenedProductBox(env)

    # qf = FeedForwardCritic(
    #     name_or_scope="critic",
    #     env_spec=env.spec,
    # )
    # NOTE(review): MlpMemoryQFunction, ActionAwareMemoryPolicy and
    # OUStrategy are not imported inside this function -- they must come
    # from module-level imports; confirm against the full file.
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = ActionAwareMemoryPolicy(
        name_or_scope="noisy_policy",
        action_dim=env_action_dim,
        memory_dim=memory_dim,
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec)
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )

    algorithm.train()
コード例 #2
0
def main():
    """Assemble the experiment variant from CLI args, set up logging and
    GPU mode, then launch the experiment."""
    # noinspection PyTypeChecker
    set_seed(args.seed)

    # Algorithm hyperparameters, partly fixed and partly taken from args.
    algo_params = {
        'num_epochs': 1000,
        'num_steps_per_epoch': args.steps_per_epoch,
        'num_steps_per_eval': 1000,
        'batch_size': args.batch_size,
        'max_path_length': 999,
        'discount': 0.99,
        'reward_scale': args.reward_scale,
        'soft_target_tau': 0.001,
        'policy_lr': 3E-4,
        'qf_lr': 3E-4,
        'vf_lr': 3E-4,
        'collection_mode': args.train_mode,
        'num_updates_per_epoch': args.updates_per_epoch,
        'num_threads': args.num_threads,
    }
    variant = {
        'algo_params': algo_params,
        'net_size': args.net_size,
    }

    setup_logger(
        args.env_name,
        variant=variant,
        exp_id=args.exp_name,
        seed=args.seed,
    )
    ptu.set_gpu_mode(not args.cpu, gpu_id=args.gpu_id)
    experiment(variant)
コード例 #3
0
ファイル: sl.py プロジェクト: jcoreyes/erl
def main():
    """Launch the supervised-learning experiment once per random seed."""
    n_seeds = 1
    mode = "here"
    exp_prefix = "dev-sl"

    # n_seeds = 10
    # mode = "ec2"
    exp_prefix = "paper-6-14-HL-sl-H25"  # overrides the dev prefix above

    H = 25

    # RNN cell configuration for the supervised learner.
    cell_params = dict(
        use_peepholes=True,
        env_noise_std=0,
        memory_noise_std=0,
        output_nonlinearity=tf.nn.tanh,
        # output_nonlinearity=tf.nn.softmax,
        env_hidden_sizes=[],
        output_dim=1,
    )
    algo_params = dict(
        num_batches_per_epoch=100,
        num_epochs=30,
        learning_rate=1e-3,
        batch_size=1000,
        eval_num_episodes=64,
        lstm_state_size=10,
        # rnn_cell_class=LSTMCell,
        # rnn_cell_params=dict(
        #     use_peepholes=True,
        # ),
        rnn_cell_class=SeparateLstmLinearCell,
        rnn_cell_params=cell_params,
        softmax=False,
    )
    # noinspection PyTypeChecker
    variant = dict(
        H=H,
        exp_prefix=exp_prefix,
        algo_params=algo_params,
        version='Supervised Learning',
        env_class=HighLow,
        env_params=dict(horizon=H)
        # env_class=OneCharMemory,
    )

    # One run per seed; exp_id counts 0, 1, ... like the original
    # start-at-minus-one-then-increment pattern.
    for exp_id in range(n_seeds):
        seed = random.randint(0, 999999)
        set_seed(seed)
        variant['seed'] = seed
        variant['exp_id'] = exp_id

        run_experiment(
            bptt_launcher,
            exp_prefix=exp_prefix,
            seed=seed,
            mode=mode,
            variant=variant,
            exp_id=exp_id,
        )
コード例 #4
0
ファイル: tf_mem_ddpg.py プロジェクト: jcoreyes/erl
def run_linear_ocm_exp(variant):
    """Run DDPG with a feed-forward policy and critic on a memory-augmented,
    flattened environment described by ``variant``."""
    from rlkit.tf.ddpg import DDPG
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.tf.policies.nn_policy import FeedForwardPolicy
    from rlkit.qfunctions.nn_qfunction import FeedForwardCritic
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from rlkit.launchers.launcher_util import (
        set_seed,
    )

    # Seed everything before any environment/model construction.
    set_seed(variant['seed'])

    # Environment pipeline: base env -> memory augmentation -> flat box.
    env = variant['env_class'](**variant['env_params'])
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=variant['memory_dim'],
    )
    env = FlattenedProductBox(env)

    critic = FeedForwardCritic(name_or_scope="critic", env_spec=env.spec)
    actor = FeedForwardPolicy(name_or_scope="policy", env_spec=env.spec)
    exploration = OUStrategy(
        env_spec=env.spec,
        **variant['ou_params']
    )

    algorithm = DDPG(
        env,
        exploration,
        actor,
        critic,
        **variant['algo_params']
    )
    algorithm.train()
コード例 #5
0
def exp_fn(variant):
    """Seed, set up logging, and run one experiment from ``variant``.

    :param variant: config dict; must contain 'exp_id', 'exp_name', 'seed'.
    :return: the value returned by ``experiment(variant)``.
    """
    exp_id = variant['exp_id']

    print(variant.keys())
    exp_prefix = variant['exp_name']
    # BUG FIX: the seed was read from the out-of-scope global `exp_specs`;
    # every other config value here comes from `variant`, so read the seed
    # from `variant` as well.
    set_seed(variant['seed'])
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=variant)

    # run the experiment
    exp_return = experiment(variant)
    return exp_return
コード例 #6
0
ファイル: trpo_memory_exp.py プロジェクト: jcoreyes/erl
def run_linear_ocm_exp(variant):
    """Train TRPO with a Gaussian MLP policy on the memory-augmented
    one-char-memory task configured by ``variant``."""
    from sandbox.rocky.tf.algos.trpo import TRPO
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import (
        ConjugateGradientOptimizer,
        FiniteDifferenceHvp,
    )
    from rlkit.envs.flattened_product_box import FlattenedProductBox
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from rlkit.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from rlkit.launchers.launcher_util import (
        set_seed, )

    # Experiment configuration.
    set_seed(variant['seed'])
    n_values = variant['num_values']
    onehot_dim = n_values + 1

    # Environment pipeline: one-char memory -> memory augmentation ->
    # flattened product box.
    env = FlattenedProductBox(
        ContinuousMemoryAugmented(
            OneCharMemoryEndOnly(
                n=n_values,
                num_steps=variant['H'],
                softmax_action=True,
            ),
            num_memory_states=onehot_dim,
        )
    )

    policy = GaussianMLPPolicy(
        name="policy",
        env_spec=env.spec,
        # The neural network policy should have two hidden layers, each with 32 hidden units.
        hidden_sizes=(32, 32),
    )
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    hvp = FiniteDifferenceHvp(**variant['optimizer_params'])
    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        optimizer=ConjugateGradientOptimizer(hvp_approach=hvp),
        **variant['trpo_params']
    )
    algo.train()
コード例 #7
0
def run_linear_ocm_exp(variant):
    """Train DdpgOcm with a linear OCM policy on the memory-augmented
    one-char-memory task configured by ``variant``."""
    from rlkit.tf.ddpg_ocm import DdpgOcm
    from rlkit.qfunctions.memory.mlp_memory_qfunction import MlpMemoryQFunction
    from rlkit.exploration_strategies.noop import NoopStrategy
    from rlkit.exploration_strategies.onehot_sampler import OneHotSampler
    from rlkit.exploration_strategies.product_strategy import ProductStrategy
    from rlkit.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from rlkit.envs.memory.one_char_memory import OneCharMemoryEndOnly
    from rlkit.tf.policies.memory.linear_ocm_policy import LinearOcmPolicy
    from rlkit.launchers.launcher_util import (
        set_seed, )

    # Experiment configuration.
    num_values = variant['num_values']
    onehot_dim = num_values + 1
    set_seed(variant['seed'])

    # Memory-augmented environment over the one-char-memory task.
    base_env = OneCharMemoryEndOnly(n=num_values, num_steps=variant['H'])
    env = ContinuousMemoryAugmented(base_env, num_memory_states=onehot_dim)

    policy = LinearOcmPolicy(
        name_or_scope="policy",
        memory_and_action_dim=onehot_dim,
        env_spec=env.spec,
    )

    # Sample one-hot env actions; leave the memory actions untouched.
    exploration = ProductStrategy([OneHotSampler(), NoopStrategy()])
    critic = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )

    algorithm = DdpgOcm(env, exploration, policy, critic,
                        **variant['ddpg_params'])
    algorithm.train()
コード例 #8
0
def main(env_name, exp_name, seed, horizon, episodes, cpu, stochastic):
    """Roll out a saved (or random) policy in ``env_name``.

    Renders interactively when ``episodes`` is 0; otherwise evaluates for
    ``episodes`` episodes and prints the success/collision summary.
    """
    if not cpu:
        set_gpu_mode(True)
    set_seed(seed)

    env = gym.make(env_name)
    env.seed(seed)
    env.set_eval()
    log_dir = settings.log_dir()

    if not exp_name:
        # No experiment given: fall back to a random policy.
        policy = RandomPolicy(env)
    else:
        policy = utils.load(log_dir, exp_name, cpu, stochastic)
        # NOTE(review): when `stochastic` is False the count is taken from
        # policy.stochastic_policy -- confirm this branch orientation is
        # intended.
        if stochastic:
            num_params = policy.num_params()
        else:
            num_params = policy.stochastic_policy.num_params()
        print(f"num params: {num_params}")

    render = episodes == 0
    reset_kwargs = {}

    def rollout_fn():
        return multitask_rollout(
            env,
            policy,
            horizon,
            render,
            observation_key="observation",
            desired_goal_key="desired_goal",
            representation_goal_key="representation_goal",
            **reset_kwargs,
        )

    if render:
        paths = utils.render(env, rollout_fn)
    else:
        success_rate, n_col, paths_states = utils.evaluate(
            rollout_fn, episodes)
        print(f"Success rate: {success_rate} - Collisions: {n_col}")
コード例 #9
0
ファイル: tf_ddpg.py プロジェクト: jcoreyes/erl
def run_linear_ocm_exp(variant):
    """Train plain DDPG (no memory augmentation) on the environment
    configured by ``variant``.

    :param variant: dict with keys 'seed', 'algo_params', 'env_class',
        'env_params', and 'ou_params'.
    """
    from rlkit.tf.ddpg import DDPG
    from rlkit.launchers.launcher_util import (
        set_seed, )
    from rlkit.exploration_strategies.ou_strategy import OUStrategy
    from rlkit.tf.policies.nn_policy import FeedForwardPolicy
    from rlkit.qfunctions.nn_qfunction import FeedForwardCritic
    """
    Set up experiment variants.
    """
    # FIX: removed the dead read `H = variant['H']` -- the local was never
    # used (the env horizon comes in via env_params).
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    ou_params = variant['ou_params']

    set_seed(seed)
    """
    Code for running the experiment.
    """

    env = env_class(**env_params)

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec, **ou_params)
    algorithm = DDPG(env, es, policy, qf, **algo_params)

    algorithm.train()
コード例 #10
0
ファイル: train.py プロジェクト: maximecolignon/nmprepr
def main(
    env_name,
    exp_dir,
    seed,
    resume,
    mode,
    archi,
    epochs,
    reward_scale,
    intrinsic_reward_scale,
    hidden_dim,
    batch_size,
    learning_rate,
    n_layers,
    soft_target_tau,
    auto_alpha,
    alpha,
    frac_goal_replay,
    horizon,
    replay_buffer_size,
    snapshot_mode,
    snapshot_gap,
    cpu,
):
    """Configure and launch SAC training for ``env_name``.

    Builds the full variant dict (replay buffer, algorithm, trainer and
    network kwargs), applies mode-specific tweaks (HER goal relabeling,
    ICM intrinsic reward), seeds, sets up logging and GPU mode, then runs
    ``sac(variant)``.

    :raises ValueError: if ``mode`` or ``archi`` is not recognized.
    :raises Exception: if ICM is requested with a non-pointnet architecture.
    """
    valid_modes = ["vanilla", "her", "her+icm", "icm"]
    valid_archi = [
        "mlp",
        "cnn",
        "pointnet",
    ]
    if mode not in valid_modes:
        raise ValueError(f"Unknown mode: {mode}")
    if archi not in valid_archi:
        raise ValueError(f"Unknown network archi: {archi}")

    machine_log_dir = settings.log_dir()
    exp_dir = os.path.join(machine_log_dir, exp_dir, f"seed{seed}")
    # Fixed per-loop step counts.  (FIX: removed the no-op self-assignment
    # `replay_buffer_size = replay_buffer_size` left over from multi-gpu
    # batch-size scaling.)
    num_expl_steps_per_train_loop = 1000
    num_eval_steps_per_epoch = 1000
    min_num_steps_before_training = 1000
    num_trains_per_train_loop = 1000
    # learning rate and soft update linear scaling
    policy_lr = learning_rate
    qf_lr = learning_rate
    variant = dict(
        env_name=env_name,
        algorithm="sac",
        version="normal",
        seed=seed,
        resume=resume,
        mode=mode,
        archi=archi,
        replay_buffer_kwargs=dict(max_replay_buffer_size=replay_buffer_size, ),
        algorithm_kwargs=dict(
            batch_size=batch_size,
            num_epochs=epochs,
            num_eval_steps_per_epoch=num_eval_steps_per_epoch,
            num_expl_steps_per_train_loop=num_expl_steps_per_train_loop,
            num_trains_per_train_loop=num_trains_per_train_loop,
            min_num_steps_before_training=min_num_steps_before_training,
            max_path_length=horizon,
        ),
        trainer_kwargs=dict(
            discount=0.99,
            soft_target_tau=soft_target_tau,
            target_update_period=1,
            policy_lr=policy_lr,
            qf_lr=qf_lr,
            reward_scale=reward_scale,
            use_automatic_entropy_tuning=auto_alpha,
            alpha=alpha,
        ),
        qf_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        policy_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        icm_kwargs=dict(hidden_dim=hidden_dim, n_layers=n_layers),
        log_dir=exp_dir,
    )

    # HER modes: relabel a fraction of goals from rollouts.
    if mode in ["her", "her+icm"]:
        variant["replay_buffer_kwargs"].update(
            dict(
                fraction_goals_rollout_goals=1 -
                frac_goal_replay,  # equal to k = 4 in HER paper
                fraction_goals_env_goals=0,
            ))

    # ICM modes: add the intrinsic reward scale to the trainer.
    if mode in ["her+icm", "icm"]:
        #TODO: Add here ICM specific actions
        variant['trainer_kwargs'][
            'intrinsic_reward_scale'] = intrinsic_reward_scale
        if archi != "pointnet":
            raise Exception("ICM can only handle pointnet architecture")

    set_seed(seed)

    setup_logger_kwargs = {
        "exp_prefix": exp_dir,
        "variant": variant,
        "log_dir": exp_dir,
        "snapshot_mode": snapshot_mode,
        "snapshot_gap": snapshot_gap,
    }
    setup_logger(**setup_logger_kwargs)
    ptu.set_gpu_mode(not cpu, distributed_mode=False)
    # FIX: plain string instead of an f-string with no placeholders (F541).
    print("Start training...")
    sac(variant)
コード例 #11
0
    # Arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-e', '--experiment', help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        exp_specs = yaml.load(spec_string)
    exp_path = exp_specs['exp_path']
    sub_exp = exp_specs['sub_exp']
    sample_from_prior = exp_specs['sample_from_prior']

    print('\n\nUSING GPU\n\n')
    ptu.set_gpu_mode(True)

    # seed
    set_seed(EVAL_SEED)

    # load the expert replay buffer
    expert_buffer = joblib.load(EXPERT_BUFFER_PATH)['meta_train']['context']

    # do eval
    all_stats = []
    try:
        alg = joblib.load(osp.join(exp_path, sub_exp, 'best_meta_test.pkl'))['algorithm']
        print('\nLOADED ALGORITHM\n')
        if exp_specs['evaluating_np_airl']:
            alg.cuda()
            alg.main_policy.preprocess_model.cuda()
        else:
            alg.cuda()
    except Exception as e:
コード例 #12
0
ファイル: train_vrnn.py プロジェクト: yifan-you-37/rl_swiss
def experiment(exp_specs):
    """Train a NewVRNN world model on on-the-fly partially-observed grid data.

    NOTE(review): as written this function cannot run to completion --
    several names are undefined or misused; see the inline NOTE(review)
    comments for the specific lines.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 1,
        'num_objs': 10 
    }
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    # NOTE(review): `next_obs_array` and `acts_array` are never defined in
    # this function (they belong to a sibling experiment that loads a replay
    # dict) -- this raises NameError as written.
    model = NewVRNN(
        next_obs_array[0].shape,
        acts_array[0].shape[0],
        exp_specs['vrnn_specs']['z_dim'],
        exp_specs['vrnn_specs']['x_encoder_specs'],
        exp_specs['vrnn_specs']['lstm_dim'],
        exp_specs['vrnn_specs']['decoder_part_specs'],
    )
    if ptu.gpu_enabled(): model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    MSE_losses = []
    KL_losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        # Truncated-BPTT boundary: backprop the accumulated ELBO and detach
        # the recurrent state so gradients do not flow across windows.
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                total_ELBO.backward()
                model_optim.step()
                prev_h_batch = prev_h_batch.detach()
                prev_c_batch = prev_c_batch.detach()
                # NOTE(review): Tensor.detach() is not in-place; this call
                # discards its result and has no effect.
                total_ELBO.detach()
        # NOTE(review): `episode_length` is undefined here -- presumably
        # exp_specs['episode_length'] was intended (NameError as written).
        if iter_num % episode_length == 0:
            total_ELBO = 0.
            total_MSE = 0.
            total_KL = 0.
            prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            prev_c_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()
                prev_c_batch = prev_c_batch.cuda()
            
            # NOTE(review): MSE_losses/KL_losses hold tensors/numbers, not
            # strings, so '\t'.join(...) raises TypeError; the joined
            # strings are also never printed.
            train_mse_print = '\t'.join(MSE_losses)
            train_kl_print = '\t'.join(KL_losses)
            MSE_losses = []
            KL_losses = []

        obs_batch, act_batch = data_loader.get_next_batch()

        prior_mean, prior_log_cov, post_mean, post_log_cov, cur_z_sample, recon_mean, recon_log_cov, prev_h_batch, prev_c_batch = model(obs_batch, act_batch, prev_h_batch, prev_c_batch)
        elbo, KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True)
        mse = ((recon_mean - obs_batch)**2).mean()

        # NOTE(review): assigns lowercase `total_elbo`, so `total_ELBO`
        # never accumulates and the backward() above back-props a constant
        # 0. -- almost certainly meant `total_ELBO = total_ELBO + elbo`.
        total_elbo = total_ELBO + elbo
        total_MSE = total_MSE + mse
        MSE_losses.append(mse)
        KL_losses.append(KL)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            model.eval()
            
            val_prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            val_prev_c_batch = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()
                val_prev_c_batch = val_prev_c_batch.cuda()

            val_total_ELBO = 0.
            val_total_KL = 0.
            val_total_MSE = 0.            
            val_MSE_losses = []
            val_KL_losses = []
            prior_imgs = []
            post_imgs = []
            obs_imgs = []
            # NOTE(review): the loop condition watches `val_data_loader`
            # but the body draws batches from `data_loader`, so this
            # validates on training data and may never terminate.
            while val_data_loader.cur_t != val_data_loader.episode_length:
                obs_batch, act_batch = data_loader.get_next_batch()

                prior_mean, prior_log_cov, post_mean, post_log_cov, cur_z_sample, recon_mean, recon_log_cov, val_prev_h_batch, val_prev_c_batch = model(obs_batch, act_batch, val_prev_h_batch, val_prev_c_batch)
                elbo, KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True)
                mse = ((recon_mean - obs_batch)**2).mean()

                # NOTE(review): lowercase `val_total_elbo` shadows the
                # `val_total_ELBO` accumulator initialized above -- same
                # case-mismatch bug as in the training loop.
                val_total_elbo = val_total_ELBO + elbo
                val_total_MSE = val_total_MSE + mse
                val_MSE_losses.append(mse)
                val_KL_losses.append(KL)

                # Reconstruction previews from the prior, posterior, and the
                # true observation (first element of the batch, HWC layout).
                prior_recon_mean, _ = model.get_obs_recon_dist(prior_mean, val_prev_h_batch)
                prior_recon_mean = np.transpose(prior_recon_mean[0].data.cpu().numpy(), (1,2,0))
                prior_imgs.append(prior_recon_mean)

                post_recon_mean, _ = model.get_obs_recon_dist(post_mean, val_prev_h_batch)
                post_recon_mean = np.transpose(post_recon_mean[0].data.cpu().numpy(), (1,2,0))
                post_imgs.append(post_recon_mean)

                obs = np.transpose(obs_batch[0].data.cpu().numpy(), (1,2,0))
                obs_imgs.append(obs)

                post_prior_KL = model.compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov)
                val_elbo, val_KL = model.compute_ELBO(
                    prior_mean, prior_log_cov,
                    post_mean, post_log_cov,
                    recon_mean, recon_log_cov,
                    obs_batch,
                    average_over_batch=True
                )
                val_total_elbo += val_elbo
                val_total_KL += post_prior_KL
                val_mse = ((recon_mean - obs_batch)**2).mean()
                val_total_MSE += val_mse

                val_MSE_losses.append(val_mse)
                # NOTE(review): `val_total_KL` is a number/tensor, not a
                # list -- `.append` raises AttributeError; presumably
                # `val_KL_losses.append(val_KL)` was intended.
                val_total_KL.append(val_KL)

            # NOTE(review): joining non-string losses raises TypeError
            # (same issue as the training-side joins above).
            val_mse_print = '\t'.join(val_MSE_losses)
            val_kl_print = '\t'.join(val_KL_losses)
            print('Avg Timestep MSE:\t%.4f' % (val_total_MSE))
            print('Avg Timestep KL:\t%.4f' % (val_total_KL))
            print('MSE:\t%s' % val_mse_print)
            print('KL:\t%s' % val_kl_print)

            # generate the gifs
            generate_gif(
                [prior_imgs, post_imgs, obs_imgs],
                ['Prior', 'Posterior', 'True Obs'],
                'junk_vis/tiny_vrnn/%d.gif' % iter_num
            )
            
            model.train()
コード例 #13
0
def run_trained_policy(path):
    """Load a trained low-level-RAPS Dreamer policy from ``path`` and roll
    it out for one evaluation episode.

    :param path: directory containing variant.json, actor.ptc and
        world_model.ptc.
    :return: the reward from the final environment step of the rollout.
    """
    ptu.set_gpu_mode(True)
    # NOTE(review): file handle opened without a context manager and never
    # closed.
    variant = json.load(open(osp.join(path, "variant.json"), "r"))
    set_seed(variant["seed"])
    variant = preprocess_variant_llraps(variant)
    env_suite = variant.get("env_suite", "kitchen")
    env_kwargs = variant["env_kwargs"]
    num_low_level_actions_per_primitive = variant[
        "num_low_level_actions_per_primitive"]
    low_level_action_dim = variant["low_level_action_dim"]

    env_name = variant["env_name"]
    make_env_lambda = lambda: make_env(env_suite, env_name, env_kwargs)

    # A single evaluation env wrapped in a vectorized interface.
    eval_envs = [make_env_lambda() for _ in range(1)]
    eval_env = DummyVecEnv(eval_envs,
                           pass_render_kwargs=variant.get(
                               "pass_render_kwargs", False))

    # Derive action-space sizes.  When the actor does not use a mixed
    # discrete/continuous distribution, the discrete part is folded into
    # the continuous action vector.
    discrete_continuous_dist = variant["actor_kwargs"][
        "discrete_continuous_dist"]
    num_primitives = eval_envs[0].num_primitives
    continuous_action_dim = eval_envs[0].max_arg_len
    discrete_action_dim = num_primitives
    if not discrete_continuous_dist:
        continuous_action_dim = continuous_action_dim + discrete_action_dim
        discrete_action_dim = 0
    action_dim = continuous_action_dim + discrete_action_dim
    obs_dim = eval_env.observation_space.low.size

    # Primitive model: maps the world-model state (stochastic +
    # deterministic parts), the high-level action, and one extra scalar
    # input to a low-level action.
    primitive_model = Mlp(
        output_size=variant["low_level_action_dim"],
        input_size=variant["model_kwargs"]["stochastic_state_size"] +
        variant["model_kwargs"]["deterministic_state_size"] +
        eval_env.envs[0].action_space.low.shape[0] + 1,
        hidden_activation=nn.ReLU,
        num_embeddings=eval_envs[0].num_primitives,
        embedding_dim=eval_envs[0].num_primitives,
        embedding_slice=eval_envs[0].num_primitives,
        **variant["primitive_model_kwargs"],
    )

    world_model = LowlevelRAPSWorldModel(
        low_level_action_dim,
        image_shape=eval_envs[0].image_shape,
        primitive_model=primitive_model,
        **variant["model_kwargs"],
    )
    actor = ActorModel(
        variant["model_kwargs"]["model_hidden_size"],
        world_model.feature_size,
        hidden_activation=nn.ELU,
        discrete_action_dim=discrete_action_dim,
        continuous_action_dim=continuous_action_dim,
        **variant["actor_kwargs"],
    )
    # Restore trained weights and move both networks to the active device.
    actor.load_state_dict(torch.load(osp.join(path, "actor.ptc")))
    world_model.load_state_dict(torch.load(osp.join(path, "world_model.ptc")))

    actor.to(ptu.device)
    world_model.to(ptu.device)

    eval_policy = DreamerLowLevelRAPSPolicy(
        world_model,
        actor,
        obs_dim,
        action_dim,
        num_low_level_actions_per_primitive=num_low_level_actions_per_primitive,
        low_level_action_dim=low_level_action_dim,
        exploration=False,  # deterministic evaluation rollout
        expl_amount=0.0,
        discrete_action_dim=discrete_action_dim,
        continuous_action_dim=continuous_action_dim,
        discrete_continuous_dist=discrete_continuous_dist,
    )
    # One rollout of max_path_length steps under no_grad + autocast:
    # step 0 resets the env and the policy; each later step queries the
    # policy and steps the env with the resulting high-level action.
    with torch.no_grad():
        with torch.cuda.amp.autocast():
            for step in range(
                    0, variant["algorithm_kwargs"]["max_path_length"] + 1):
                if step == 0:
                    observation = eval_env.envs[0].reset()
                    eval_policy.reset(observation.reshape(1, -1))
                    policy_o = (None, observation.reshape(1, -1))
                    reward = 0
                else:
                    high_level_action, _ = eval_policy.get_action(policy_o, )
                    observation, reward, done, info = eval_env.envs[0].step(
                        high_level_action[0], )
                    # Policy input is (low-level actions, low-level obs),
                    # each with a leading batch axis of size 1.
                    low_level_obs = np.expand_dims(
                        np.array(info["low_level_obs"]), 0)
                    low_level_action = np.expand_dims(
                        np.array(info["low_level_action"]), 0)
                    policy_o = (low_level_action, low_level_obs)
    return reward
コード例 #14
0
def experiment(exp_specs):
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    data_loader = BasicDataLoader(next_obs_array[:40000],
                                  acts_array[:40000],
                                  exp_specs['episode_length'],
                                  exp_specs['batch_size'],
                                  use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(next_obs_array[40000:],
                                      acts_array[40000:],
                                      exp_specs['episode_length'],
                                      exp_specs['batch_size'],
                                      use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, 32, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU(),
        nn.Conv2d(32, 32, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(32), nn.ReLU())
    ae_dim = 128
    z_dim = 128
    pre_gru = nn.Sequential(nn.Linear(288 + z_dim + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    post_fc = nn.Sequential(nn.Linear(ae_dim + 288 + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    post_mean_fc = nn.Linear(ae_dim, z_dim, bias=True)
    post_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True)
    prior_fc = nn.Sequential(nn.Linear(ae_dim + 4, ae_dim, bias=False),
                             nn.BatchNorm1d(ae_dim), nn.ReLU(),
                             nn.Linear(ae_dim, ae_dim, bias=False),
                             nn.BatchNorm1d(ae_dim), nn.ReLU())
    prior_mean_fc = nn.Linear(ae_dim, z_dim, bias=True)
    prior_log_cov_fc = nn.Linear(ae_dim, z_dim, bias=True)
    gru = nn.GRUCell(ae_dim, ae_dim, bias=True)
    fc_decoder = nn.Sequential(
        nn.Linear(ae_dim + z_dim + 4, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, 288, bias=False),
        nn.BatchNorm1d(288),
        nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(32,
                           32,
                           1,
                           stride=1,
                           padding=0,
                           output_padding=0,
                           bias=False), nn.BatchNorm2d(32), nn.ReLU(),
        nn.ConvTranspose2d(32,
                           32,
                           1,
                           stride=1,
                           padding=0,
                           output_padding=0,
                           bias=False), nn.BatchNorm2d(32), nn.ReLU(),
        nn.Conv2d(32, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid())
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        pre_gru.cuda()
        post_fc.cuda()
        post_mean_fc.cuda()
        post_log_cov_fc.cuda()
        prior_fc.cuda()
        prior_mean_fc.cuda()
        prior_log_cov_fc.cuda()
        gru.cuda()
        fc_decoder.cuda()
        conv_decoder.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam([
        item for sublist in map(lambda x: list(x.parameters()), [
            pre_gru, conv_encoder, gru, fc_decoder, conv_decoder, post_fc,
            post_log_cov_fc, post_mean_fc, prior_fc, prior_log_cov_fc,
            prior_mean_fc
        ]) for item in sublist
    ],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    KLs = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss = loss + total_KL
                loss.backward()
                model_optim.step()
            loss = 0
            total_KL = 0
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()

            if iter_num % exp_specs['freq_val'] == 0:
                train_loss_print = '\t'.join(losses)
                train_KLs_print = '\t'.join(KLs)
            losses = []
            KLs = []

        obs_batch, act_batch = data_loader.get_next_batch()

        enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)

        hidden = post_fc(torch.cat([prev_h_batch, enc, act_batch], 1))
        post_mean = post_mean_fc(hidden)
        post_log_cov = post_log_cov_fc(hidden)

        hidden = prior_fc(torch.cat([prev_h_batch, act_batch], 1))
        prior_mean = prior_mean_fc(hidden)
        prior_log_cov = prior_log_cov_fc(hidden)

        recon = fc_decoder(torch.cat([prev_h_batch, act_batch, post_mean],
                                     1)).view(obs_batch.size(0), 32, 3, 3)
        recon = conv_decoder(recon)

        hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1))
        prev_h_batch = gru(hidden, prev_h_batch)

        KL = compute_KL(prior_mean, prior_log_cov, post_mean, post_log_cov)
        if iter_num % episode_length != 0:
            loss = loss + torch.sum(
                (obs_batch.view(obs_batch.size(0), -1) -
                 recon.view(obs_batch.size(0), -1))**2, 1).mean()
            total_KL = total_KL + KL
        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        KLs.append('%.4f' % KL)

        if iter_num % (50 * exp_specs['episode_length']) in range(
                2 * exp_specs['episode_length']):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/full_KL_mem_grid_%d_recon.png' % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/full_KL_mem_grid_%d_obs.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(
                map(lambda x: x.eval(), [
                    pre_gru, conv_encoder, gru, fc_decoder, conv_decoder,
                    post_fc, post_log_cov_fc, post_mean_fc, prior_fc,
                    prior_log_cov_fc, prior_mean_fc
                ]))

            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()

            val_losses = []
            val_KLs = []
            for i in range(freq_bptt):
                obs_batch, act_batch = data_loader.get_next_batch()

                enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)

                hidden = post_fc(torch.cat([prev_h_batch, enc, act_batch], 1))
                post_mean = post_mean_fc(hidden)
                post_log_cov = post_log_cov_fc(hidden)

                hidden = prior_fc(torch.cat([prev_h_batch, act_batch], 1))
                prior_mean = prior_mean_fc(hidden)
                prior_log_cov = prior_log_cov_fc(hidden)

                recon = fc_decoder(
                    torch.cat([prev_h_batch, act_batch, post_mean],
                              1)).view(obs_batch.size(0), 32, 3, 3)
                recon = conv_decoder(recon)

                hidden = pre_gru(torch.cat([enc, post_mean, act_batch], 1))
                prev_h_batch = gru(hidden, prev_h_batch)

                val_losses.append('%.4f' % ((obs_batch - recon)**2).mean())
                val_KL = compute_KL(prior_mean, prior_log_cov, post_mean,
                                    post_log_cov)
                val_KLs.append('%.4f' % val_KL)

            val_loss_print = '\t'.join(val_losses)
            val_KLs_print = '\t'.join(val_KLs)
            print('Val MSE:\t' + val_loss_print)
            print('Train MSE:\t' + train_loss_print)
            print('Val KL:\t\t' + val_KLs_print)
            print('Train KL:\t' + train_KLs_print)

            list(
                map(lambda x: x.train(), [
                    pre_gru, conv_encoder, gru, fc_decoder, conv_decoder,
                    post_fc, post_log_cov_fc, post_mean_fc, prior_fc,
                    prior_log_cov_fc, prior_mean_fc
                ]))
コード例 #15
0
ファイル: sac.py プロジェクト: xtma/dsac
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'],
    )
    algorithm.to(ptu.device)
    algorithm.train()


# Entry point: parse CLI flags, load the YAML experiment config, then run SAC.
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description='Soft Actor Critic')
    arg_parser.add_argument('--config',
                            type=str,
                            default="configs/lunarlander.yaml")
    arg_parser.add_argument('--gpu', type=int, default=0, help="using cpu with -1")
    arg_parser.add_argument('--seed', type=int, default=0)
    cli_args = arg_parser.parse_args()

    # Load the experiment spec and stamp the chosen seed into it.
    with open(cli_args.config, 'r', encoding="utf-8") as config_file:
        exp_variant = yaml.load(config_file, Loader=yaml.FullLoader)
    exp_variant["seed"] = cli_args.seed

    # Log-dir prefix looks like "sac_<env-name-minus-extension>_<version>".
    prefix = "_".join(
        ["sac", exp_variant["env"][:-3].lower(),
         str(exp_variant["version"])])
    setup_logger(prefix, variant=exp_variant, seed=cli_args.seed)

    if cli_args.gpu >= 0:
        ptu.set_gpu_mode(True, cli_args.gpu)
    set_seed(cli_args.seed)
    experiment(exp_variant)
コード例 #16
0
def experiment(exp_specs):
    """Train a deterministic recurrent autoencoder (encoder -> GRUCell ->
    decoder) on stored transitions, with truncated BPTT every ``freq_bptt``
    iterations.

    Fixes over the previous revision:
      * ``model_optim.zero_grad()`` is called before each truncated-BPTT
        backward pass (gradients previously accumulated across windows),
      * validation losses go into their own ``val_losses`` list instead of
        clobbering the training ``losses`` accumulator,
      * validation runs under ``torch.no_grad()`` so no graph is built.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    # First 40k transitions train, the remainder validate.
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    data_loader = BasicDataLoader(next_obs_array[:40000],
                                  acts_array[:40000],
                                  exp_specs['episode_length'],
                                  exp_specs['batch_size'],
                                  use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(next_obs_array[40000:],
                                      acts_array[40000:],
                                      exp_specs['episode_length'],
                                      exp_specs['batch_size'],
                                      use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    ae_dim = 128
    encoder = nn.Sequential(nn.Linear(48, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU())
    gru = nn.GRUCell(ae_dim, ae_dim, bias=True)
    # Decoder predicts the current (flattened, 48-dim) obs from hidden state
    # plus the 4-dim action.
    decoder = nn.Sequential(nn.Linear(ae_dim + 4, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, ae_dim, bias=False),
                            nn.BatchNorm1d(ae_dim), nn.ReLU(),
                            nn.Linear(ae_dim, 48), nn.Sigmoid())
    if ptu.gpu_enabled():
        encoder.cuda()
        gru.cuda()
        decoder.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(list(encoder.parameters()) +
                       list(decoder.parameters()) + list(gru.parameters()),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # End of a truncated-BPTT window: one optimizer step.
                # BUGFIX: zero stale gradients first; they otherwise
                # accumulate across every window.
                model_optim.zero_grad()
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
            loss = 0
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], ae_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()

            if iter_num % exp_specs['freq_val'] == 0:
                train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        # Predict current obs from previous hidden state + action, then roll
        # the hidden state forward with the encoding of the observed frame.
        recon = decoder(torch.cat([prev_h_batch, act_batch],
                                  1)).view(obs_batch.size())
        enc = encoder(obs_batch.view(obs_batch.size(0), -1))
        prev_h_batch = gru(enc, prev_h_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        # Skip the first step of each window: the hidden state was just reset.
        if iter_num % freq_bptt != 0:
            loss = loss + (
                (obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])

        if iter_num % 250 in range(10):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/with_wd_1e-3_ae_recon_%d.png' % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/with_wd_1e-3_ae_obs_%d.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(map(lambda x: x.eval(), [encoder, decoder, gru]))

            with torch.no_grad():
                val_prev_h_batch = Variable(
                    torch.zeros(exp_specs['batch_size'], ae_dim))
                if ptu.gpu_enabled():
                    val_prev_h_batch = val_prev_h_batch.cuda()

                # BUGFIX: use a dedicated list; previously this reassigned the
                # training `losses`, contaminating the next train-MSE printout
                # with validation entries.
                val_losses = []
                for i in range(freq_bptt):
                    obs_batch, act_batch = val_data_loader.get_next_batch()
                    recon = decoder(torch.cat([val_prev_h_batch, act_batch],
                                              1)).view(obs_batch.size())
                    enc = encoder(obs_batch.view(obs_batch.size(0), -1))
                    val_prev_h_batch = gru(enc, val_prev_h_batch)
                    val_losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(val_losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)

            list(map(lambda x: x.train(), [encoder, decoder, gru]))
コード例 #17
0
def experiment(exp_specs):
    """Train a VAE (optionally a mask-producing ``MaskedVAE``) on 1-digit
    multi-MNIST canvases.

    Dumps training/validation reconstructions (and encoder masks when
    ``masked``) under ``img_save_path`` and prints ELBO / MSE / KL
    diagnostics.

    BUGFIX: ``model_optim.zero_grad()`` is now called before each backward
    pass; previously gradients were never zeroed and accumulated over the
    whole of training.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)
    img_save_path = 'junk_vis/debug_more_proper'

    # Prep the data -----------------------------------------------------------
    data_path = 'junk_vis/multi_mnist_data'
    canvas_size = 36
    (X_train, _), (X_test, _) = multi_mnist(data_path,
                                            max_digits=1,
                                            canvas_size=canvas_size,
                                            seed=42,
                                            use_max=True)
    # Add a channel dimension and rescale pixel values to [0, 1].
    X_train = X_train[:, None, ...]
    X_test = X_test[:, None, ...]
    X_train, X_test = torch.FloatTensor(X_train) / 255.0, torch.FloatTensor(
        X_test) / 255.0

    # Clamp away fully-saturated pixels so the likelihood stays well-behaved.
    X_train = torch.clamp(X_train, 0.05, 0.95)
    X_test = torch.clamp(X_test, 0.05, 0.95)
    train_ds = TensorDataset(X_train)
    # NOTE(review): val_ds is unused below — validation indexes X_test
    # directly; kept for symmetry with train_ds.
    val_ds = TensorDataset(X_test)

    # Model Definition --------------------------------------------------------
    if exp_specs['masked']:
        model = MaskedVAE(
            [1, canvas_size, canvas_size],
            exp_specs['vae_specs']['z_dim'],
            exp_specs['vae_specs']['encoder_specs'],
            exp_specs['vae_specs']['decoder_specs'],
        )
    else:
        model = VAE(
            [1, canvas_size, canvas_size],
            exp_specs['vae_specs']['z_dim'],
            exp_specs['vae_specs']['encoder_specs'],
            exp_specs['vae_specs']['decoder_specs'],
        )
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    global_iter = 0
    for epoch in range(exp_specs['epochs']):
        train_loader = DataLoader(train_ds,
                                  batch_size=exp_specs['batch_size'],
                                  shuffle=True,
                                  num_workers=4,
                                  pin_memory=True,
                                  drop_last=True)
        for iter_num, img_batch in enumerate(train_loader):
            img_batch = img_batch[0]
            if ptu.gpu_enabled(): img_batch = img_batch.cuda()

            z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model(
                img_batch)
            elbo, KL = model.compute_ELBO(z_mean,
                                          z_log_cov,
                                          recon_mean,
                                          recon_log_cov,
                                          img_batch,
                                          average_over_batch=True)
            loss = -1. * elbo
            # BUGFIX: clear stale gradients before the backward pass.
            model_optim.zero_grad()
            loss.backward()
            model_optim.step()

            if global_iter % 1000 == 0:
                mse = ((recon_mean - img_batch)**2).mean()
                print('\nTraining Iter %d...' % global_iter)
                print('ELBO:\t%.4f' % elbo)
                print('MSE:\t%.4f' % mse)
                print('KL:\t%.4f' % KL)
                save_pytorch_tensor_as_img(
                    img_batch[0].data.cpu(),
                    os.path.join(img_save_path,
                                 '%d_train_img.png' % (global_iter)))
                save_pytorch_tensor_as_img(
                    recon_mean[0].data.cpu(),
                    os.path.join(img_save_path,
                                 '%d_train_recon.png' % (global_iter)))
                if exp_specs['masked']:
                    save_pytorch_tensor_as_img(
                        enc_mask[0].data.cpu(),
                        os.path.join(img_save_path,
                                     '%d_train_enc_mask.png' % (global_iter)))

            if global_iter % exp_specs['freq_val'] == 0:
                with torch.no_grad():
                    print('Validating Iter %d...' % global_iter)
                    model.eval()

                    # Evaluate on a random test batch of the same size.
                    idxs = np.random.choice(int(X_test.size(0)),
                                            size=exp_specs['batch_size'],
                                            replace=False)
                    img_batch = X_test[idxs]
                    if ptu.gpu_enabled(): img_batch = img_batch.cuda()

                    z_mean, z_log_cov, recon_mean, recon_log_cov, enc_mask, dec_mask = model(
                        img_batch)
                    elbo, KL = model.compute_ELBO(z_mean,
                                                  z_log_cov,
                                                  recon_mean,
                                                  recon_log_cov,
                                                  img_batch,
                                                  average_over_batch=True)
                    mse = ((recon_mean - img_batch)**2).mean()

                    print('ELBO:\t%.4f' % elbo)
                    print('MSE:\t%.4f' % mse)
                    print('KL:\t%.4f' % KL)

                    for i in range(1):
                        save_pytorch_tensor_as_img(
                            img_batch[i].data.cpu(),
                            os.path.join(img_save_path,
                                         '%d_%d_img.png' % (global_iter, i)))
                        save_pytorch_tensor_as_img(
                            recon_mean[i].data.cpu(),
                            os.path.join(img_save_path,
                                         '%d_%d_recon.png' % (global_iter, i)))
                        if exp_specs['masked']:
                            save_pytorch_tensor_as_img(
                                enc_mask[i].data.cpu(),
                                os.path.join(
                                    img_save_path,
                                    '%d_%d_enc_mask.png' % (global_iter, i)))

                    model.train()

            global_iter += 1
コード例 #18
0
ファイル: train_ae.py プロジェクト: yifan-you-37/rl_swiss
def experiment(exp_specs):
    """Train a feed-forward autoencoder on stored observations.

    The reconstruction loss is accumulated for ``freq_bptt`` iterations
    before each optimizer step (matching the update cadence of the recurrent
    variants in this repo).

    BUGFIX: ``model_optim.zero_grad()`` is now called before each backward
    pass; previously gradients accumulated across every window for the whole
    run.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    # First 40k transitions train, the remainder validate.
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    data_loader = BasicDataLoader(next_obs_array[:40000],
                                  acts_array[:40000],
                                  exp_specs['episode_length'],
                                  exp_specs['batch_size'],
                                  use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(next_obs_array[40000:],
                                      acts_array[40000:],
                                      exp_specs['episode_length'],
                                      exp_specs['batch_size'],
                                      use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    ae_dim = 128
    model = nn.Sequential(nn.Linear(48, ae_dim, bias=False),
                          nn.BatchNorm1d(ae_dim, affine=False), nn.ReLU(),
                          nn.Linear(ae_dim, ae_dim, bias=False),
                          nn.BatchNorm1d(ae_dim, affine=False), nn.ReLU(),
                          nn.Linear(ae_dim, ae_dim, bias=False),
                          nn.BatchNorm1d(ae_dim, affine=False), nn.ReLU(),
                          nn.Linear(ae_dim, 48), nn.Sigmoid())
    if ptu.gpu_enabled(): model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # End of a window: zero stale gradients, then step.
                model_optim.zero_grad()
                loss.backward()
                model_optim.step()
            loss = 0

        obs_batch, act_batch = data_loader.get_next_batch()
        recon = model(obs_batch.view(obs_batch.size(0),
                                     -1)).view(obs_batch.size())
        loss = loss + (
            (obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])

        if iter_num % 50 == 0:
            save_pytorch_tensor_as_img(recon[0].data.cpu(),
                                       'junk_vis/ae_recon_%d.png' % iter_num)
            save_pytorch_tensor_as_img(obs_batch[0].data.cpu(),
                                       'junk_vis/ae_obs_%d.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            model.eval()

            obs_batch, act_batch = val_data_loader.get_next_batch()
            recon = model(obs_batch.view(obs_batch.size(0),
                                         -1)).view(obs_batch.size())

            print('MSE:\t%.4f' % ((obs_batch - recon)**2).mean())

            model.train()
コード例 #19
0
def experiment(exp_specs):
    """Train the NewVRNN variational world model on on-the-fly maze rollouts.

    Uses truncated BPTT: the per-step ELBO is accumulated for ``freq_bptt``
    steps, then one optimizer step is taken and the recurrent state and
    accumulator are detached from the old graph.

    Fixes over the previous revision:
      * ``detach()`` results are rebound (``x = x.detach()``) — the old code
        called ``prev_z.detach()`` / ``total_ELBO.detach()`` and discarded the
        return value, so the graph was never actually truncated,
      * ``model_optim.zero_grad()`` before each backward pass,
      * validation ELBO/KL accumulators are actually updated (the old code
        assigned to a typo'd ``val_total_elbo`` and never added to
        ``val_total_KL``),
      * a dead recomputation of the ELBO/KL at the end of the validation loop
        was removed.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 4,
        'num_objs': 10
    }
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor, exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    # Observations are RGB at (obs_h * scale) x (obs_w * scale).
    model = NewVRNN(
        [3, env_specs['obs_h']*env_specs['scale'], env_specs['obs_w']*env_specs['scale']],
        exp_specs['vrnn_specs']['act_proc_dim'],
        exp_specs['vrnn_specs']['z_dim'],
        exp_specs['vrnn_specs']['pre_post_gru_dim'],
        exp_specs['vrnn_specs']['x_encoder_specs'],
        exp_specs['vrnn_specs']['decoder_part_specs'],
    )
    if ptu.gpu_enabled():
        model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(), lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    MSE_losses = []
    KL_losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # End of a truncated-BPTT window: maximize the accumulated ELBO.
                loss = -1. * total_ELBO
                model_optim.zero_grad()
                loss.backward()
                model_optim.step()
                # detach() is NOT in-place — rebind, otherwise the previous
                # window's graph is retained and the next backward() fails
                # (or memory grows without bound).
                prev_z = prev_z.detach()
                total_ELBO = total_ELBO.detach()
        if iter_num % episode_length == 0:
            # Episode boundary: reset accumulators and the recurrent state.
            total_ELBO = 0.
            total_MSE = 0.
            total_KL = 0.
            prev_z = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim))
            if ptu.gpu_enabled():
                prev_z = prev_z.cuda()

            train_mse_print = '\t'.join(MSE_losses)
            train_kl_print = '\t'.join(KL_losses)
            MSE_losses = []
            KL_losses = []

        obs_batch, act_batch = data_loader.get_next_batch()

        prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, prev_z = model(obs_batch, act_batch, prev_z)
        elbo, KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True)
        mse = ((recon_mean - obs_batch)**2).mean()

        MSE_losses.append('%.4f' % mse)
        KL_losses.append('%.4f' % KL)
        # Skip the first step of each episode: the state was just reset.
        if iter_num % episode_length != 0:
            total_ELBO = total_ELBO + elbo
            total_MSE = total_MSE + mse

        if iter_num % exp_specs['freq_val'] == 0:
            with torch.no_grad():
                print('\nValidating Iter %d...' % iter_num)
                model.eval()

                val_prev_z = Variable(torch.zeros(exp_specs['batch_size'], model.lstm_dim))
                if ptu.gpu_enabled():
                    val_prev_z = val_prev_z.cuda()

                val_total_ELBO = 0.
                val_total_KL = 0.
                val_total_MSE = 0.
                val_total_prior_MSE = 0.
                val_MSE_losses = []
                val_prior_MSE_losses = []
                val_KL_losses = []
                prior_imgs = []
                post_imgs = []
                post_sample_imgs = []
                obs_imgs = []
                for _ in range(episode_length):
                    obs_batch, act_batch = val_data_loader.get_next_batch()

                    prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, val_prev_z = model(obs_batch, act_batch, val_prev_z)
                    val_elbo, val_KL = model.compute_ELBO(prior_mean, prior_log_cov, post_mean, post_log_cov, recon_mean, recon_log_cov, obs_batch, average_over_batch=True)
                    val_mse = ((recon_mean - obs_batch)**2).mean()

                    print('Mean:')
                    print(torch.exp(post_mean)[0,:8].data.cpu().numpy())
                    print(torch.exp(prior_mean)[0,:8].data.cpu().numpy())
                    print('-----')
                    print('Cov')
                    print(torch.exp(post_log_cov)[0,:8].data.cpu().numpy())
                    print(torch.exp(prior_log_cov)[0,:8].data.cpu().numpy())
                    print('-----------------------')

                    # BUGFIX: the old code assigned to a typo'd
                    # `val_total_elbo` and never touched val_total_KL.
                    val_total_ELBO = val_total_ELBO + val_elbo
                    val_total_MSE = val_total_MSE + val_mse
                    val_total_KL = val_total_KL + val_KL

                    val_MSE_losses.append('%.4f' % val_mse)
                    val_KL_losses.append('%.4f' % val_KL)

                    # Reconstruction from the prior mean (no posterior info).
                    prior_recon_mean, _ = model.get_obs_recon_dist(prior_mean, act_batch)
                    val_prior_mse = ((prior_recon_mean - obs_batch)**2).mean()
                    val_total_prior_MSE = val_total_prior_MSE + val_prior_mse
                    val_prior_MSE_losses.append('%.4f' % val_prior_mse)
                    prior_recon_mean = np.transpose(prior_recon_mean[0].data.cpu().numpy(), (1,2,0))
                    prior_imgs.append(prior_recon_mean)

                    post_recon_mean, _ = model.get_obs_recon_dist(post_mean, act_batch)
                    post_recon_mean = np.transpose(post_recon_mean[0].data.cpu().numpy(), (1,2,0))
                    post_imgs.append(post_recon_mean)

                    sample_recon_mean = recon_mean
                    sample_recon_mean = np.transpose(sample_recon_mean[0].data.cpu().numpy(), (1,2,0))
                    post_sample_imgs.append(sample_recon_mean)

                    obs = np.transpose(obs_batch[0].data.cpu().numpy(), (1,2,0))
                    obs_imgs.append(obs)

                val_mse_print = '\t'.join(val_MSE_losses)
                val_prior_mse_print = '\t'.join(val_prior_MSE_losses)
                val_kl_print = '\t'.join(val_KL_losses)
                # NOTE(review): these "Avg" figures are episode *sums*, not
                # per-timestep averages — divide by episode_length to compare
                # across different episode lengths.
                print('Avg Timestep MSE:\t\t%.4f' % (val_total_MSE))
                print('Avg Timestep Prior MSE:\t%.4f' % (val_total_prior_MSE))
                print('Avg Timestep KL:\t\t%.4f' % (val_total_KL))
                print('MSE:\t\t%s' % val_mse_print)
                print('Prior MSE:\t%s' % val_prior_mse_print)
                print('KL:\t\t%s' % val_kl_print)

                # generate the gifs
                generate_gif(
                    [post_sample_imgs, prior_imgs, post_imgs, obs_imgs],
                    ['Posterior Sample', 'Prior', 'Posterior', 'True Obs'],
                    'junk_vis/vrnn_kl_0/%d.gif' % iter_num
                )

                model.train()
コード例 #20
0
def experiment(exp_specs):
    """Train a VAE (or mask-producing VAESeg) on randomly sampled stored
    observations, validating and saving reconstruction images every
    ``freq_val`` iterations.

    BUGFIX: ``model_optim.zero_grad()`` is now called before each backward
    pass; previously gradients were never zeroed and accumulated over the
    whole of training.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    # First 4k transitions train, the remainder validate.
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    data_loader = RandomDataLoader(next_obs_array[:4000],
                                   acts_array[:4000],
                                   use_gpu=ptu.gpu_enabled())
    val_data_loader = RandomDataLoader(next_obs_array[4000:],
                                       acts_array[4000:],
                                       use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    if exp_specs['use_masked_vae']:
        model = VAESeg()
    else:
        model = VAE()
    if ptu.gpu_enabled(): model.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(model.parameters(),
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    for iter_num in range(int(float(exp_specs['max_iters']))):
        obs_batch, act_batch = data_loader.get_next_batch(
            exp_specs['batch_size'])
        if exp_specs['use_masked_vae']:
            recon_mean, recon_log_cov, z_mean, z_log_cov, mask = model(
                obs_batch)
        else:
            recon_mean, recon_log_cov, z_mean, z_log_cov = model(obs_batch)
        elbo = model.compute_ELBO(z_mean, z_log_cov, recon_mean, recon_log_cov,
                                  obs_batch)
        KL = model.compute_KL(z_mean, z_log_cov)
        neg_elbo = -1. * elbo
        # BUGFIX: clear stale gradients before the backward pass.
        model_optim.zero_grad()
        neg_elbo.backward()
        model_optim.step()

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            model.eval()

            obs_batch, act_batch = val_data_loader.get_next_batch(
                exp_specs['batch_size'])
            if exp_specs['use_masked_vae']:
                recon_mean, recon_log_cov, z_mean, z_log_cov, mask = model(
                    obs_batch)
                # Broadcast the single-channel mask to RGB for saving.
                mask = mask.repeat(1, 3, 1, 1)
                save_pytorch_tensor_as_img(
                    mask[0].data.cpu(), 'junk_vis/mask_vae_%d.png' % iter_num)
            else:
                recon_mean, recon_log_cov, z_mean, z_log_cov = model(obs_batch)
            elbo = model.compute_ELBO(z_mean, z_log_cov, recon_mean,
                                      recon_log_cov, obs_batch)
            KL = model.compute_KL(z_mean, z_log_cov)

            print('\nELBO:\t%.4f' % elbo)
            print('KL:\t%.4f' % KL)
            print('MSE:\t%.4f' % ((recon_mean - obs_batch)**2).mean())
            print(obs_batch[0][0, :4, :4])
            print(recon_mean[0][0, :4, :4])
            print(recon_log_cov[0][0, :4, :4])
            print(z_mean[0, 1])
            print(torch.exp(z_log_cov[0, 1]))

            save_pytorch_tensor_as_img(recon_mean[0].data.cpu(),
                                       'junk_vis/recon_vae_%d.png' % iter_num)
            save_pytorch_tensor_as_img(obs_batch[0].data.cpu(),
                                       'junk_vis/obs_vae_%d.png' % iter_num)

            model.train()
コード例 #21
0
ファイル: sl_bptt_exp.py プロジェクト: jcoreyes/erl
def main():
    """Sweep RNN cell classes and horizons for the supervised-learning BPTT task.

    For every (cell class, horizon) combination a variant dict is built and
    launched once per seed via ``run_experiment``.
    """
    n_seeds = 1
    mode = "here"
    exp_prefix = "dev-sl"

    # n_seeds = 10
    # mode = "ec2"
    # exp_prefix = "6-2-sl-rwa-vs-lstm"

    env_noise_std = 0
    memory_noise_std = 0
    cell_classes = [SeparateRWALinearCell]
    horizons = [512]
    # cell_classes = [RWACell, LSTMCell, GRUCell]
    # horizons = [512, 256, 128, 64]
    for rnn_cell_class, H in product(cell_classes, horizons):
        # noinspection PyTypeChecker
        variant = dict(
            H=H,
            exp_prefix=exp_prefix,
            algo_params=dict(
                num_batches_per_epoch=10000 // 32,
                num_epochs=100,
                learning_rate=1e-3,
                batch_size=32,
                eval_num_episodes=64,
                lstm_state_size=10,
                rnn_cell_class=rnn_cell_class,
                rnn_cell_params=dict(
                    # use_peepholes=True,
                    state_is_flat_externally=False,
                    output_dim=1,
                ),
                # rnn_cell_class=SeparateLstmLinearCell,
                # rnn_cell_params=dict(
                #     use_peepholes=True,
                #     env_noise_std=env_noise_std,
                #     memory_noise_std=memory_noise_std,
                #     output_nonlinearity=tf.nn.tanh,
                #     # output_nonlinearity=tf.nn.softmax,
                #     env_hidden_sizes=[],
                # ),
                softmax=False,
            ),
            version='Supervised Learning',
            env_class=HighLow,
            # env_class=OneCharMemory,
        )

        # One experiment id per seed, counted from 0.
        for exp_id, seed in enumerate(range(n_seeds)):
            set_seed(seed)
            variant['seed'] = seed
            variant['exp_id'] = exp_id

            run_experiment(
                bptt_launcher,
                exp_prefix=exp_prefix,
                seed=seed,
                mode=mode,
                variant=variant,
                exp_id=exp_id,
            )
コード例 #22
0
def experiment(variant, seed):
    """Build and train a PEARL soft actor-critic agent on a multi-task env.

    Constructs the context encoder, twin Q-functions, and tanh-Gaussian
    policy, optionally restores pre-trained weights, then runs
    ``algorithm.train()``.  ``VARIANT_NAME`` and ``ENVS`` are module-level
    globals defined outside this block.
    """
    exp_id = "{}_{}".format(VARIANT_NAME, seed)
    print("\nExperiment: {}\nTask distribution: {}\n".format(
        exp_id, variant['env_name']))

    # Randomization seed for reproducibility
    set_seed(seed)

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    # SAC heuristic: target entropy = -|A|.
    target_entropy = -action_dim
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    # Context is (s, a, r) or (s, a, r, s') per transition, depending on config.
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    # With an information bottleneck the encoder outputs mean and variance
    # parameters (2 * latent_dim); otherwise a point estimate.
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    # Twin Q networks condition on (obs, action, latent task embedding).
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    # First n_train_tasks indices train; last n_eval_tasks indices evaluate.
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2],
        latent_dim=latent_dim,
        target_entropy=target_entropy,
        **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        # NOTE(review): relies on the target Q nets being the last two entries
        # of algorithm.networks — fragile if that list ever changes order.
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_qf1.pth')))
        algorithm.networks[-1].load_state_dict(
            torch.load(os.path.join(path, 'target_qf2.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        # presumably moves all networks to the configured device — the target
        # device is resolved inside algorithm.to(); confirm against its impl.
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    experiment_log_dir = setup_logger(
        variant['env_name'],
        tasks={i: env.tasks[i]
               for i in range(len(env.tasks))},
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
コード例 #23
0
def experiment(variant):
    """Offline multi-task PEARL training from pre-collected BCQ buffers.

    Loads per-task replay buffers (via ray remote actors), builds the PEARL
    networks plus a value function, optionally restores weights, and trains.
    """
    domain = variant['domain']
    seed = variant['seed']
    exp_mode = variant['exp_mode']
    max_path_length = variant['algo_params']['max_path_length']
    bcq_interactions = variant['bcq_interactions']
    num_tasks = variant['num_tasks']

    # Goals were pickled per (domain, exp_mode); only the first num_tasks
    # task indices are used.
    filename = f'./goals/{domain}-{exp_mode}-goals.pkl'
    idx_list, train_goals, wd_goals, ood_goals = pickle.load(
        open(filename, 'rb'))
    idx_list = idx_list[:num_tasks]

    sub_buffer_dir = f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/interactions_{bcq_interactions}k/seed_{seed}"
    buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir)

    print("Buffer directory: " + buffer_dir)

    # Load buffer
    bcq_buffers = []

    # Kick off all buffer loads in parallel as ray remote tasks, then block
    # on ray.get until every buffer is in memory.
    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        # Buffer files are zero-padded to two digits for idx < 10.
        bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
            num_trans_context=variant['num_trans_context'],
            in_mdp_batch_size=variant['in_mdp_batch_size'],
        )

        buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename))
        bcq_buffers.append(rp_buffer)
    ray.get(buffer_loader_id_list)

    assert len(bcq_buffers) == len(idx_list)

    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers, )

    set_seed(variant['seed'])

    # create multi-task environment and sample tasks
    # NOTE(review): env is seeded with 0 regardless of variant['seed'] —
    # confirm this is intentional (tasks identical across seeds).
    env = env_producer(variant['domain'], seed=0)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    # Context is (s, a, r) or (s, a, r, s') per transition, depending on config.
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant[
        'algo_params'][
            'use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    # With an information bottleneck the encoder outputs mean and variance
    # parameters (2 * latent_dim); otherwise a point estimate.
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params'][
        'use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    # Twin Q networks condition on (obs, action, latent task embedding).
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    # State-value function over (obs, latent).
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(env=env,
                                     train_goals=train_goals,
                                     wd_goals=wd_goals,
                                     ood_goals=ood_goals,
                                     replay_buffers=train_buffer,
                                     nets=[agent, qf1, qf2, vf],
                                     latent_dim=latent_dim,
                                     **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        # presumably moves all networks to the configured device; confirm
        # against PEARLSoftActorCritic.to().
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['domain'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
コード例 #24
0
def experiment(variant, args):
    """Train a DIAYN+MUSIC agent on the 2D manipulation environment.

    Builds twin Q-functions with targets, a skill discriminator, a SMILE
    mutual-information critic, and a skill-conditioned policy, then runs the
    online RL algorithm.
    """
    # expl_env = NormalizedBoxEnv(gym.make(str(args.env)))
    # eval_env = NormalizedBoxEnv(gym.make(str(args.env)))
    expl_env = NormalizedBoxEnv(Mani2dEnv())
    eval_env = NormalizedBoxEnv(Mani2dEnv())

    setup_logger(f'DIAYNMUSIC_{args.skill_dim}_{args.env}',
                 variant=variant,
                 snapshot_mode="last")
    ptu.set_gpu_mode(True)  # optionally set the GPU (default=False)
    set_seed(args.seed)

    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    skill_dim = args.skill_dim

    M = variant['layer_size']

    def make_q_net():
        # Q(s, a, z): one scalar per (obs, action, skill) triple.
        return FlattenMlp(
            input_size=obs_dim + action_dim + skill_dim,
            output_size=1,
            hidden_sizes=[M, M],
        )

    # Construction order matters for RNG reproducibility; keep it fixed.
    qf1 = make_q_net()
    qf2 = make_q_net()
    target_qf1 = make_q_net()
    target_qf2 = make_q_net()
    # Skill discriminator: predicts the skill from the observation.
    df = FlattenMlp(
        input_size=obs_dim,
        output_size=skill_dim,
        hidden_sizes=[M, M],
    )
    # SMILE mutual-information estimator with clipped critic output.
    mi_estimator = ConcatCritic(obs_dim, M, 2, "relu")
    smile_clip = 1.0

    policy = SkillTanhGaussianPolicy(obs_dim=obs_dim + skill_dim,
                                     action_dim=action_dim,
                                     hidden_sizes=[M, M],
                                     skill_dim=skill_dim)
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = DIAYNMdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_step_collector = MdpStepCollector(
        expl_env,
        policy,
    )
    replay_buffer = DIAYNEnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        skill_dim,
    )
    trainer = DIAYNMUSICTrainer(env=eval_env,
                                policy=policy,
                                qf1=qf1,
                                qf2=qf2,
                                df=df,
                                target_qf1=target_qf1,
                                target_qf2=target_qf2,
                                mi_estimator=mi_estimator,
                                smile_clip=smile_clip,
                                prio_extrio_bound=6,
                                **variant['trainer_kwargs'])
    algorithm = DIAYNTorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_step_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
コード例 #25
0
ファイル: rl_algorithm.py プロジェクト: Knoxantropicen/rlkit
def take_step_in_env_per_thread(pid, queue, env, policy, render, reward_scale,
                                steps, max_path_length, n_env_steps_total):
    """Collect `steps` environment transitions in one worker.

    Episodes are cut at `max_path_length` or on terminal; completed paths are
    stacked into `exploration_paths`.  Results are returned directly when
    `queue` is None, otherwise pushed onto the queue tagged with `pid`.
    """
    set_seed(pid)
    n_rollouts_total = 0
    path_builder = PathBuilder()
    exploration_paths = []
    sample_keys = (
        'observations',
        'actions',
        'rewards',
        'next_observations',
        'terminals',
        'agent_infos',
        'env_infos',
    )
    replay_samples = {key: [] for key in sample_keys}

    policy.reset()
    observation = env.reset()
    policy.set_num_steps_total(n_env_steps_total)

    for _ in range(steps):
        action, agent_info = policy.get_action(observation)
        # Only the first worker renders.
        if pid == 0 and render:
            env.render()
        next_ob, raw_reward, terminal, env_info = env.step(action)
        reward = np.array([raw_reward * reward_scale])
        terminal = np.array([terminal])

        step_record = (observation, action, reward, next_ob, terminal,
                       agent_info, env_info)
        for key, value in zip(sample_keys, step_record):
            replay_samples[key].append(value)

        path_builder.add_all(
            observations=observation,
            actions=action,
            rewards=reward,
            next_observations=next_ob,
            terminals=terminal,
            agent_infos=agent_info,
            env_infos=env_info,
        )

        if terminal or len(path_builder) >= max_path_length:
            # cannot let replay buffer terminate episode
            n_rollouts_total += 1
            if len(path_builder) > 0:
                exploration_paths.append(path_builder.get_all_stacked())
                path_builder = PathBuilder()
            policy.reset()
            observation = env.reset()
        else:
            observation = next_ob

    if queue is None:
        return exploration_paths, replay_samples, n_rollouts_total
    queue.put([pid, exploration_paths, replay_samples, n_rollouts_total])
コード例 #26
0
ファイル: conv_sqiar.py プロジェクト: yifan-you-37/rl_swiss
def experiment(exp_specs):
    """Train an LSTM-based pixel forward model on an on-the-fly maze env.

    An encoder (conv + fc) maps observations into an LSTM state; a decoder
    (fc + deconv) predicts the next observation mean/log-covariance from the
    previous hidden state and the current action.  Training uses truncated
    BPTT with truncation window ``freq_bptt``.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    env_specs = {
        'flat_repr': False,
        'one_hot_repr': False,
        'maze_h': 9,
        'maze_w': 9,
        'obs_h': 5,
        'obs_w': 5,
        'scale': 4,
        'num_objs': 10
    }
    # Each loader generates fresh episodes from its own env instance.
    maze_constructor = lambda: PartiallyObservedGrid(env_specs)
    data_loader = VerySpecificOnTheFLyDataLoader(maze_constructor,
                                                 exp_specs['episode_length'],
                                                 exp_specs['batch_size'],
                                                 use_gpu=ptu.gpu_enabled())
    val_data_loader = VerySpecificOnTheFLyDataLoader(
        maze_constructor,
        exp_specs['episode_length'],
        exp_specs['batch_size'],
        use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    # Two stride-2 convs downsample the observation, then two stride-1 convs.
    conv_channels = 32
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, conv_channels, 4, stride=2, padding=1, bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU(),
        nn.Conv2d(conv_channels,
                  conv_channels,
                  4,
                  stride=2,
                  padding=1,
                  bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(),
        nn.Conv2d(conv_channels,
                  conv_channels,
                  3,
                  stride=1,
                  padding=1,
                  bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU(),
        nn.Conv2d(conv_channels,
                  conv_channels,
                  3,
                  stride=1,
                  padding=1,
                  bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU())
    ae_dim = 256
    gru_dim = 512
    # img_h: spatial size of the conv feature map fed to/from the fc layers.
    img_h = 5
    flat_inter_img_dim = img_h * img_h * conv_channels
    act_dim = 64
    # Linear embedding of the 4-dim action.
    act_proc = nn.Linear(4, act_dim, bias=True)
    fc_encoder = nn.Sequential(
        nn.Linear(flat_inter_img_dim + act_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        # nn.Linear(ae_dim, ae_dim, bias=False),
        # nn.BatchNorm1d(ae_dim),
        # nn.ReLU(),
        # nn.Linear(ae_dim, ae_dim, bias=False),
        # nn.BatchNorm1d(ae_dim),
        # nn.ReLU(),
        # nn.Linear(ae_dim, ae_dim, bias=False),
        # nn.BatchNorm1d(ae_dim),
        # nn.ReLU()
    )
    # Despite the name, the recurrent core is an LSTM cell (h and c states).
    gru = nn.LSTMCell(ae_dim, gru_dim, bias=True)
    fc_decoder = nn.Sequential(
        nn.Linear(gru_dim + act_dim, 256, bias=False),
        nn.BatchNorm1d(256),
        nn.ReLU(),
        nn.Linear(256, 2 * flat_inter_img_dim, bias=False),
        nn.BatchNorm1d(2 * flat_inter_img_dim),
        nn.ReLU(),
        # # nn.Linear(ae_dim, ae_dim, bias=False),
        # # nn.BatchNorm1d(ae_dim),
        # # nn.ReLU(),
        # # nn.Linear(ae_dim, ae_dim, bias=False),
        # # nn.BatchNorm1d(ae_dim),
        # # nn.ReLU(),
        # nn.Linear(ae_dim, flat_inter_img_dim, bias=False),
        # nn.BatchNorm1d(flat_inter_img_dim),
        # nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.ConvTranspose2d(64,
                           64,
                           4,
                           stride=2,
                           padding=1,
                           output_padding=0,
                           bias=False),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        nn.ConvTranspose2d(64,
                           64,
                           4,
                           stride=2,
                           padding=1,
                           output_padding=0,
                           bias=False),
        nn.BatchNorm2d(64),
        nn.ReLU(),
        # nn.Conv2d(conv_channels, conv_channels, 3, stride=1, padding=1, bias=False),
        # nn.BatchNorm2d(conv_channels),
        # nn.ReLU(),
    )
    # Heads producing per-pixel mean (sigmoid to [0,1]) and log-covariance.
    mean_decoder = nn.Sequential(
        nn.Conv2d(64, 3, 1, stride=1, padding=0, bias=True), nn.Sigmoid())
    log_cov_decoder = nn.Sequential(
        nn.Conv2d(64, 3, 1, stride=1, padding=0, bias=True), )
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        fc_encoder.cuda()
        gru.cuda()
        fc_decoder.cuda()
        conv_decoder.cuda()
        mean_decoder.cuda()
        log_cov_decoder.cuda()
        act_proc.cuda()

    # Optimizer ---------------------------------------------------------------
    # NOTE(review): act_proc's parameters are NOT in the optimizer, so the
    # action embedding stays at its random initialization — confirm intended.
    model_optim = Adam([
        item for sublist in map(lambda x: list(x.parameters()), [
            fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder,
            mean_decoder, log_cov_decoder
        ]) for item in sublist
    ],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        # Truncated BPTT: every freq_bptt iterations, backprop the accumulated
        # loss and detach the recurrent state to cut the graph.
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
                prev_h_batch = prev_h_batch.detach()
                prev_c_batch = prev_c_batch.detach()
            loss = 0
        # Episode boundary: reset the LSTM state and the loss printout buffer.
        if iter_num % episode_length == 0:
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_dim))
            prev_c_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()
                prev_c_batch = prev_c_batch.cuda()

            train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        act_batch = act_proc(act_batch)

        # Predict the current observation from the previous hidden state and
        # the current (embedded) action.
        hidden = fc_decoder(torch.cat([prev_h_batch, act_batch],
                                      1)).view(obs_batch.size(0), 64, img_h,
                                               img_h)
        hidden = conv_decoder(hidden)
        recon = mean_decoder(hidden)
        log_cov = log_cov_decoder(hidden)
        log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

        # Encode the true observation and advance the recurrent state.
        enc = conv_encoder(obs_batch)
        enc = enc.view(obs_batch.size(0), -1)
        enc = fc_encoder(torch.cat([enc, act_batch], 1))
        prev_h_batch, prev_c_batch = gru(enc, (prev_h_batch, prev_c_batch))

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        # Skip the first step of each episode: the zero-initialized hidden
        # state cannot predict the first frame.
        if iter_num % episode_length != 0:
            loss = loss + (
                (obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])
            # loss = loss - compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])

        # Periodically dump reconstruction/observation images for two episodes.
        if iter_num % (500 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/debug_2_good_acts_on_the_fly_pogrid_len_8_scale_4/rnn_recon_%d.png'
                % iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/debug_2_good_acts_on_the_fly_pogrid_len_8_scale_4/rnn_obs_%d.png'
                % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(
                map(lambda x: x.eval(), [
                    fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder,
                    mean_decoder, log_cov_decoder, act_proc
                ]))

            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_dim))
            val_prev_c_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()
                val_prev_c_batch = val_prev_c_batch.cuda()

            # NOTE(review): this clobbers the accumulated training `losses`,
            # so the next train_loss_print may mix val/train entries — verify.
            losses = []
            for i in range(episode_length):
                obs_batch, act_batch = val_data_loader.get_next_batch()
                act_batch = act_proc(act_batch)

                hidden = fc_decoder(torch.cat([val_prev_h_batch, act_batch],
                                              1)).view(obs_batch.size(0), 64,
                                                       img_h, img_h)
                hidden = conv_decoder(hidden)
                recon = mean_decoder(hidden)
                log_cov = log_cov_decoder(hidden)
                log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

                enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
                enc = fc_encoder(torch.cat([enc, act_batch], 1))
                val_prev_h_batch, val_prev_c_batch = gru(
                    enc, (val_prev_h_batch, val_prev_c_batch))

                # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])
                losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)

            list(
                map(lambda x: x.train(), [
                    fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder,
                    mean_decoder, log_cov_decoder, act_proc
                ]))
コード例 #27
0
def experiment(exp_specs):
    """Train a GRU-based pixel forward model on replayed maze transitions.

    An encoder (1x1 convs + fc stack) maps observations into a GRU state; a
    decoder (fc stack + 1x1 deconvs) predicts the next observation from the
    previous hidden state and the current action.  Training uses truncated
    BPTT with truncation window ``freq_bptt``.

    Fixes vs. the original:
      * The validation loop now reads from ``val_data_loader`` (it previously
        read training data from ``data_loader``, so "Val MSE" was measured on
        the training split and ``val_data_loader`` was never used).
      * Validation accumulates into a local ``val_losses`` list instead of
        clobbering the training ``losses`` accumulator.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    # First 40k transitions train; the remainder validate.
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    data_loader = BasicDataLoader(
        next_obs_array[:40000], acts_array[:40000], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(
        next_obs_array[40000:], acts_array[40000:], exp_specs['episode_length'], exp_specs['batch_size'], use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    # All convs are 1x1: the model operates per-pixel on 5x5 observations.
    conv_channels = 64
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, conv_channels, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
        nn.Conv2d(conv_channels, conv_channels, 1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU()
    )
    ae_dim = 128
    gru_dim = 512
    img_h = 5
    flat_inter_img_dim = img_h * img_h * conv_channels
    fc_encoder = nn.Sequential(
        nn.Linear(flat_inter_img_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU()
    )
    gru = nn.GRUCell(
        ae_dim, gru_dim, bias=True
    )
    # Decoder input is the previous hidden state concatenated with the 4-dim
    # raw action.
    fc_decoder = nn.Sequential(
        nn.Linear(gru_dim + 4, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, ae_dim, bias=False),
        nn.BatchNorm1d(ae_dim),
        nn.ReLU(),
        nn.Linear(ae_dim, flat_inter_img_dim, bias=False),
        nn.BatchNorm1d(flat_inter_img_dim),
        nn.ReLU(),
    )
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(conv_channels, conv_channels, 1, stride=1, padding=0, output_padding=0, bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
        nn.ConvTranspose2d(conv_channels, conv_channels, 1, stride=1, padding=0, output_padding=0, bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True),
        nn.Sigmoid()
    )
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        fc_encoder.cuda()
        gru.cuda()
        fc_decoder.cuda()
        conv_decoder.cuda()

    # Optimizer ---------------------------------------------------------------
    model_optim = Adam(
        [
            item for sublist in
            map(
                lambda x: list(x.parameters()),
                [fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder]
            )
            for item in sublist
        ],
        lr=float(exp_specs['model_lr']), weight_decay=float(exp_specs['model_wd'])
    )

    # -------------------------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        # Truncated BPTT: every freq_bptt iterations, backprop the accumulated
        # loss and detach the recurrent state to cut the graph.
        if iter_num % freq_bptt == 0:
            if iter_num > 0:
                # loss = loss / freq_bptt
                loss.backward()
                model_optim.step()
                prev_h_batch = prev_h_batch.detach()
            loss = 0
        # Episode boundary: reset the GRU state and the loss printout buffer.
        if iter_num % episode_length == 0:
            prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], gru_dim))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()

            if iter_num % exp_specs['freq_val'] == 0:
                train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        # Predict the current observation from the previous hidden state and
        # the current action.
        recon = fc_decoder(torch.cat([prev_h_batch, act_batch], 1)).view(obs_batch.size(0), conv_channels, img_h, img_h)
        recon = conv_decoder(recon)

        # Encode the true observation and advance the recurrent state.
        enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
        enc = fc_encoder(enc)
        prev_h_batch = gru(enc, prev_h_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        # Skip the first step of each episode: the zero-initialized hidden
        # state cannot predict the first frame.
        if iter_num % episode_length != 0:
            loss = loss + ((obs_batch - recon)**2).sum()/float(exp_specs['batch_size'])

        # Periodically dump reconstruction/observation images for two episodes.
        if iter_num % (50*episode_length) in range(2*episode_length):
            save_pytorch_tensor_as_img(recon[0].data.cpu(), 'junk_vis/fixed_colors_simple_maze_5_h/rnn_recon_%d.png' % iter_num)
            save_pytorch_tensor_as_img(obs_batch[0].data.cpu(), 'junk_vis/fixed_colors_simple_maze_5_h/rnn_obs_%d.png' % iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            list(map(lambda x: x.eval(), [fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder]))

            val_prev_h_batch = Variable(torch.zeros(exp_specs['batch_size'], gru_dim))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()

            # Use a dedicated accumulator so training `losses` is preserved.
            val_losses = []
            for i in range(episode_length):
                # BUGFIX: validate on held-out data, not the training loader.
                obs_batch, act_batch = val_data_loader.get_next_batch()

                recon = fc_decoder(torch.cat([val_prev_h_batch, act_batch], 1)).view(obs_batch.size(0), conv_channels, img_h, img_h)
                recon = conv_decoder(recon)

                enc = conv_encoder(obs_batch).view(obs_batch.size(0), -1)
                enc = fc_encoder(enc)
                val_prev_h_batch = gru(enc, val_prev_h_batch)

                val_losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(val_losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)

            list(map(lambda x: x.train(), [fc_encoder, conv_encoder, gru, fc_decoder, conv_decoder]))
コード例 #28
0
def experiment(exp_specs):
    """Train an action-conditioned ConvGRU dynamics model on replay data.

    Loads (next_observation, action) trajectories from a saved replay dict,
    builds a conv encoder -> ConvGRU -> conv decoder pipeline, and trains it
    with truncated BPTT to reconstruct each observed frame from the previous
    hidden state and the current action.  Periodically saves reconstruction
    images and prints train/validation per-step MSE.

    Args:
        exp_specs: dict of experiment hyperparameters; keys used here:
            'use_gpu', 'exp_id', 'exp_name', 'seed', 'replay_dict_path',
            'episode_length', 'batch_size', 'model_lr', 'model_wd',
            'max_iters', 'freq_bptt', 'freq_val'.
    """
    ptu.set_gpu_mode(exp_specs['use_gpu'])
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Prep the data -----------------------------------------------------------
    replay_dict = joblib.load(exp_specs['replay_dict_path'])
    next_obs_array = replay_dict['next_observations']
    acts_array = replay_dict['actions']
    # First 40k transitions form the training split; the remainder validation.
    data_loader = BasicDataLoader(next_obs_array[:40000],
                                  acts_array[:40000],
                                  exp_specs['episode_length'],
                                  exp_specs['batch_size'],
                                  use_gpu=ptu.gpu_enabled())
    val_data_loader = BasicDataLoader(next_obs_array[40000:],
                                      acts_array[40000:],
                                      exp_specs['episode_length'],
                                      exp_specs['batch_size'],
                                      use_gpu=ptu.gpu_enabled())

    # Model Definition --------------------------------------------------------
    # Two stride-2 convs downsample the input 4x per spatial dimension.
    conv_channels = 32
    conv_encoder = nn.Sequential(
        nn.Conv2d(3, conv_channels, 4, stride=2, padding=1, bias=False),
        nn.BatchNorm2d(conv_channels), nn.ReLU(),
        nn.Conv2d(conv_channels,
                  conv_channels,
                  4,
                  stride=2,
                  padding=1,
                  bias=False), nn.BatchNorm2d(conv_channels), nn.ReLU())
    gru_channels = 128
    # Spatial size of the GRU hidden feature map; assumes the encoder output
    # is 5x5 (i.e. 20x20 input frames) — TODO confirm against the data.
    inter_h = 5
    act_channels = 4
    # Project the 4-dim action into a spatial feature map so it can be
    # concatenated channel-wise with conv features.
    act_proc = nn.Linear(4, act_channels * inter_h * inter_h, bias=True)
    pre_gru_conv = nn.Sequential(
        nn.Conv2d(act_channels + conv_channels,
                  conv_channels,
                  3,
                  stride=1,
                  padding=1,
                  bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    gru = ConvGRUCell(conv_channels, gru_channels, 3)
    post_gru_conv = nn.Sequential(
        nn.Conv2d(act_channels + gru_channels,
                  conv_channels,
                  3,
                  stride=1,
                  padding=1,
                  bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    # Two stride-2 transposed convs upsample back to the input resolution.
    conv_decoder = nn.Sequential(
        nn.ConvTranspose2d(conv_channels,
                           conv_channels,
                           4,
                           stride=2,
                           padding=1,
                           output_padding=0,
                           bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
        nn.ConvTranspose2d(conv_channels,
                           conv_channels,
                           4,
                           stride=2,
                           padding=1,
                           output_padding=0,
                           bias=False),
        nn.BatchNorm2d(conv_channels),
        nn.ReLU(),
    )
    # Per-pixel mean in [0, 1] and (clamped) log-covariance heads.
    mean_decoder = nn.Sequential(
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True),
        nn.Sigmoid())
    log_cov_decoder = nn.Sequential(
        nn.Conv2d(conv_channels, 3, 1, stride=1, padding=0, bias=True), )
    if ptu.gpu_enabled():
        conv_encoder.cuda()
        pre_gru_conv.cuda()
        gru.cuda()
        post_gru_conv.cuda()
        conv_decoder.cuda()
        mean_decoder.cuda()
        log_cov_decoder.cuda()
        act_proc.cuda()

    # Optimizer ---------------------------------------------------------------
    # One Adam optimizer over the flattened parameter lists of all modules.
    model_optim = Adam([
        item for sublist in map(lambda x: list(x.parameters()), [
            conv_encoder, pre_gru_conv, gru, post_gru_conv, conv_decoder,
            mean_decoder, log_cov_decoder
        ]) for item in sublist
    ],
                       lr=float(exp_specs['model_lr']),
                       weight_decay=float(exp_specs['model_wd']))

    # Training loop -----------------------------------------------------------
    freq_bptt = exp_specs['freq_bptt']
    episode_length = exp_specs['episode_length']
    losses = []
    for iter_num in range(int(float(exp_specs['max_iters']))):
        # Truncated-BPTT boundary: step on the accumulated loss and cut the
        # graph by detaching the hidden state.
        if iter_num % freq_bptt == 0:
            # Guard with is_tensor: if no loss was accumulated in the window
            # (degenerate freq_bptt/episode_length alignment) `loss` is the
            # int 0 and backward() would crash.
            if iter_num > 0 and torch.is_tensor(loss):
                # BUGFIX: gradients must be cleared each optimization step;
                # previously zero_grad() was never called, so gradients
                # accumulated across all BPTT windows for the whole run.
                model_optim.zero_grad()
                loss.backward()
                model_optim.step()
                prev_h_batch = prev_h_batch.detach()
            loss = 0
        # Episode boundary: reset the recurrent state and flush the per-step
        # training-MSE printout.
        if iter_num % episode_length == 0:
            prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_channels, inter_h,
                            inter_h))
            if ptu.gpu_enabled():
                prev_h_batch = prev_h_batch.cuda()

            train_loss_print = '\t'.join(losses)
            losses = []

        obs_batch, act_batch = data_loader.get_next_batch()
        act_batch = act_proc(act_batch).view(act_batch.size(0), act_channels,
                                             inter_h, inter_h)

        # Predict the current frame from the previous hidden state + action.
        hidden = post_gru_conv(torch.cat([prev_h_batch, act_batch], 1))
        hidden = conv_decoder(hidden)
        recon = mean_decoder(hidden)
        log_cov = log_cov_decoder(hidden)
        log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

        # Update the hidden state using the actually-observed frame.
        enc = conv_encoder(obs_batch)
        enc = pre_gru_conv(torch.cat([enc, act_batch], 1))
        prev_h_batch = gru(enc, prev_h_batch)

        losses.append('%.4f' % ((obs_batch - recon)**2).mean())
        # Skip the first step of each episode: the zero hidden state cannot
        # predict the initial frame.
        if iter_num % episode_length != 0:
            loss = loss + (
                (obs_batch - recon)**2).sum() / float(exp_specs['batch_size'])
            # loss = loss + compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])

        # Dump recon/ground-truth images for the first two episodes of every
        # 500-episode cycle.
        if iter_num % (500 * episode_length) in range(2 * episode_length):
            save_pytorch_tensor_as_img(
                recon[0].data.cpu(),
                'junk_vis/conv_gru_pogrid_len_8_scale_4/rnn_recon_%d.png' %
                iter_num)
            save_pytorch_tensor_as_img(
                obs_batch[0].data.cpu(),
                'junk_vis/conv_gru_pogrid_len_8_scale_4/rnn_obs_%d.png' %
                iter_num)

        if iter_num % exp_specs['freq_val'] == 0:
            print('\nValidating Iter %d...' % iter_num)
            # Eval mode so BatchNorm uses running statistics.
            list(
                map(lambda x: x.eval(), [
                    conv_encoder, pre_gru_conv, gru, post_gru_conv,
                    conv_decoder, mean_decoder, log_cov_decoder, act_proc
                ]))

            val_prev_h_batch = Variable(
                torch.zeros(exp_specs['batch_size'], gru_channels, inter_h,
                            inter_h))
            if ptu.gpu_enabled():
                val_prev_h_batch = val_prev_h_batch.cuda()

            # BUGFIX: use a separate list; previously the training `losses`
            # list was clobbered here, so validation MSEs leaked into the
            # next "Train MSE" printout and in-progress training MSEs were
            # discarded.
            val_losses = []
            for i in range(episode_length):
                obs_batch, act_batch = val_data_loader.get_next_batch()
                act_batch = act_proc(act_batch).view(act_batch.size(0),
                                                     act_channels, inter_h,
                                                     inter_h)

                hidden = post_gru_conv(
                    torch.cat([val_prev_h_batch, act_batch], 1))
                hidden = conv_decoder(hidden)
                recon = mean_decoder(hidden)
                log_cov = log_cov_decoder(hidden)
                log_cov = torch.clamp(log_cov, LOG_COV_MIN, LOG_COV_MAX)

                enc = conv_encoder(obs_batch)
                enc = pre_gru_conv(torch.cat([enc, act_batch], 1))
                val_prev_h_batch = gru(enc, val_prev_h_batch)

                # val_loss = compute_diag_log_prob(recon, log_cov, obs_batch)/float(exp_specs['batch_size'])
                val_losses.append('%.4f' % ((obs_batch - recon)**2).mean())

            loss_print = '\t'.join(val_losses)
            print('Val MSE:\t' + loss_print)
            print('Train MSE:\t' + train_loss_print)

            # Restore train mode before resuming optimization.
            list(
                map(lambda x: x.train(), [
                    conv_encoder, pre_gru_conv, gru, post_gru_conv,
                    conv_decoder, mean_decoder, log_cov_decoder, act_proc
                ]))
コード例 #29
0
        training_env=meta_train_env, # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    # Move the algorithm's networks to GPU before training when available.
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    # Success sentinel expected by the launcher machinery — presumably; the
    # return value is not consumed here. TODO confirm against the caller.
    return 1


if __name__ == '__main__':
    # Parse the experiment-spec path, load the YAML spec, seed everything,
    # set up logging, and run the experiment.
    parser = argparse.ArgumentParser()
    # required=True gives a clear argparse error instead of the opaque
    # open(None) TypeError the script previously died with when -e was
    # omitted.
    parser.add_argument('-e', '--experiment', required=True,
                        help='experiment specification file')
    args = parser.parse_args()
    with open(args.experiment, 'r') as spec_file:
        spec_string = spec_file.read()
        # BUGFIX: yaml.load() without an explicit Loader is unsafe (can
        # execute arbitrary constructors) and raises TypeError on PyYAML>=6;
        # safe_load is the correct call for plain config files.
        exp_specs = yaml.safe_load(spec_string)

    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    experiment(exp_specs)
コード例 #30
0
def experiment(exp_specs):
    """Fit a feed-forward model predicting (next_observation, reward) from
    (observation, action) pairs drawn from a saved replay buffer, with
    periodic validation on a held-out split.
    """
    # Set up logging ----------------------------------------------------------
    exp_id = exp_specs['exp_id']
    exp_prefix = exp_specs['exp_name']
    seed = exp_specs['seed']
    set_seed(seed)
    setup_logger(exp_prefix=exp_prefix, exp_id=exp_id, variant=exp_specs)

    # Load the data -----------------------------------------------------------
    extra_data_path = exp_specs['extra_data_path']
    train_replay_buffer = joblib.load(extra_data_path)['replay_buffer']
    # Freeze the buffer at its current fill level.
    train_replay_buffer.change_max_size_to_cur_size()
    # NOTE(review): next observations are stripped of the leading
    # `extra_obs_dim` dims unconditionally, while `_observations` are only
    # stripped when remove_env_info is set — confirm this asymmetry is
    # intended (targets never include env info, inputs optionally do).
    train_replay_buffer._next_obs = train_replay_buffer._next_obs[:,exp_specs['extra_obs_dim']:]
    if exp_specs['remove_env_info']:
        train_replay_buffer._observations = train_replay_buffer._observations[:,exp_specs['extra_obs_dim']:]
    else:
        if exp_specs['normalize_env_info']:
            # Rescale the env-info dims from [low, high] to [-1, 1].
            low, high = exp_specs['env_info_range'][0], exp_specs['env_info_range'][1]
            train_replay_buffer._observations[:,:exp_specs['extra_obs_dim']] -= (low + high)/2.0
            train_replay_buffer._observations[:,:exp_specs['extra_obs_dim']] /= (high - low)/2.0

    # Print dataset statistics for sanity-checking.
    print('\nRewards: {} +/- {}'.format(
        np.mean(train_replay_buffer._rewards),
        np.std(train_replay_buffer._rewards)
    ))

    next_obs_mean = np.mean(train_replay_buffer._next_obs, 0)
    next_obs_std = np.std(train_replay_buffer._next_obs, 0)
    print('\nNext Obs:\n{}\n+/-\n{}'.format(
        next_obs_mean,
        next_obs_std
    ))

    print('\nAvg Next Obs Square Norm: {}'.format(
        np.mean(np.linalg.norm(train_replay_buffer._next_obs, axis=1)**2)
    ))

    # Infer observation/action dims from one sampled batch.
    sample_batch = train_replay_buffer.random_batch(exp_specs['train_batch_size'])
    obs_dim = sample_batch['observations'].shape[-1]
    act_dim = sample_batch['actions'].shape[-1]

    # Carve a validation set out of the training buffer.
    val_replay_buffer = SimpleReplayBuffer(exp_specs['val_set_size'], obs_dim, act_dim)
    val_replay_buffer.set_buffer_from_dict(
        train_replay_buffer.sample_and_remove(exp_specs['val_set_size'])
    )
    if exp_specs['train_from_beginning_transitions']:
        # Take the chronologically-first transitions as the training set.
        trans_dict = dict(
            observations=train_replay_buffer._observations[:exp_specs['train_set_size']],
            actions=train_replay_buffer._actions[:exp_specs['train_set_size']],
            rewards=train_replay_buffer._rewards[:exp_specs['train_set_size']],
            terminals=train_replay_buffer._terminals[:exp_specs['train_set_size']],
            next_observations=train_replay_buffer._next_obs[:exp_specs['train_set_size']],
        )
        train_replay_buffer.set_buffer_from_dict(trans_dict)
    else:
        # Or subsample the training set uniformly at random.
        train_replay_buffer.set_buffer_from_dict(
            train_replay_buffer.sample_and_remove(exp_specs['train_set_size'])
        )

    # Model Definitions -------------------------------------------------------
    # Output is next observation concatenated with a scalar reward (+1).
    if exp_specs['remove_env_info']:
        output_dim = [obs_dim + 1]
    else:
        output_dim = [obs_dim - exp_specs['extra_obs_dim'] + 1]
    model = GenericMap(
        [obs_dim + act_dim],
        output_dim,
        siamese_input=False,
        siamese_output=False,
        num_hidden_layers=exp_specs['num_hidden_layers'],
        hidden_dim=exp_specs['hidden_dim'],
        act='relu',
        use_bn=True,
        deterministic=True
    )

    model_optim = Adam(model.parameters(), lr=float(exp_specs['lr']))

    # Train -------------------------------------------------------------------
    model.train()
    for iter_num in range(exp_specs['max_iters']):
        model_optim.zero_grad()

        batch = train_replay_buffer.random_batch(exp_specs['train_batch_size'])
        batch = convert_numpy_dict_to_pytorch(batch)
        # Model input: [obs, act]; regression target: [next_obs, reward].
        inputs = Variable(torch.cat([batch['observations'], batch['actions']], -1))
        outputs = Variable(torch.cat([batch['next_observations'], batch['rewards']], -1))

        preds = model([inputs])[0]
        if exp_specs['residual']:
            # residual for observations
            # Predict the delta from the current observation (env-info dims
            # excluded); the reward column gets a zero residual.
            preds = preds + Variable(
                        torch.cat(
                            [
                                batch['observations'][:,exp_specs['extra_obs_dim']:],
                                torch.zeros(exp_specs['train_batch_size'], 1)
                            ],
                        1)
                    )
        
        # Mean over batch of the squared error summed across output dims.
        loss = torch.mean(torch.sum((outputs - preds)**2, -1))

        loss.backward()
        model_optim.step()

        if iter_num % exp_specs['freq_val'] == 0:
            # Eval mode so BatchNorm uses running statistics.
            model.eval()

            val_batch = val_replay_buffer.random_batch(exp_specs['val_batch_size'])
            val_batch = convert_numpy_dict_to_pytorch(val_batch)
            inputs = Variable(torch.cat([val_batch['observations'], val_batch['actions']], -1))
            outputs = Variable(torch.cat([val_batch['next_observations'], val_batch['rewards']], -1))

            # print(exp_specs['remove_env_info'])
            # print(inputs)
            # print(outputs)
            # sleep(5)
            
            preds = model([inputs])[0]
            if exp_specs['residual']:
                # residual for observations
                # NOTE(review): zeros are sized with train_batch_size but the
                # batch here has val_batch_size rows — this would raise a
                # shape error whenever the two differ; likely should be
                # exp_specs['val_batch_size']. Verify.
                preds = preds + Variable(
                            torch.cat(
                                [
                                    val_batch['observations'][:,exp_specs['extra_obs_dim']:],
                                    torch.zeros(exp_specs['train_batch_size'], 1)
                                ],
                            1)
                        )

            # Total loss plus separate next-obs / reward components.
            loss = torch.mean(torch.sum((outputs - preds)**2, -1))
            next_obs_loss = torch.mean(torch.sum((outputs[:,:-1] - preds[:,:-1])**2, -1))
            rew_loss = torch.mean(torch.sum((outputs[:,-1:] - preds[:,-1:])**2, -1))

            print('\n')
            print('-'*20)
            logger.record_tabular('Iter', iter_num)
            # NOTE(review): `.data[0]` is pre-0.4 PyTorch scalar indexing;
            # on modern PyTorch this needs `.item()` — confirm the pinned
            # torch version.
            logger.record_tabular('Loss', loss.data[0])
            logger.record_tabular('Obs Loss', next_obs_loss.data[0])
            logger.record_tabular('Rew Loss', rew_loss.data[0])
            logger.dump_tabular(with_prefix=False, with_timestamp=False)

            # Back to train mode for the next optimization steps.
            model.train()