Example #1
def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
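
For context, here is a minimal sketch of how such a launcher might be invoked directly. The 'net_size' and 'algo_params' keys are the ones the function reads; the individual SoftActorCritic hyperparameter names are assumptions borrowed from the flat SAC variant assembled later on this page, and all values are placeholders rather than tuned settings.

# Hypothetical configuration; exact SoftActorCritic kwargs may differ between
# railrl versions, so treat these names and values as illustrative only.
variant = dict(
    net_size=300,
    algo_params=dict(
        num_epochs=100,
        num_steps_per_epoch=1000,
        num_steps_per_eval=1000,
        max_path_length=1000,
        batch_size=128,
        discount=0.99,
        reward_scale=1,
    ),
)
experiment(variant)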
Example #2
def experiment(variant):
    # env = NormalizedBoxEnv(Reacher7DofXyzGoalState())
    env = NormalizedBoxEnv(MultitaskPoint2DEnv())
    vectorized = True
    policy = StochasticTdmPolicy(env=env, **variant['policy_kwargs'])
    qf = TdmQf(env=env,
               vectorized=vectorized,
               norm_order=2,
               **variant['qf_kwargs'])
    vf = TdmVf(env=env, vectorized=vectorized, **variant['vf_kwargs'])
    replay_buffer_size = variant['algo_params']['base_kwargs'][
        'replay_buffer_size']
    replay_buffer = HerReplayBuffer(replay_buffer_size, env)
    algorithm = TdmSac(
        env,
        qf,
        vf,
        variant['algo_params']['sac_kwargs'],
        variant['algo_params']['tdm_kwargs'],
        variant['algo_params']['base_kwargs'],
        supervised_weight=variant['algo_params']['supervised_weight'],
        policy=policy,
        replay_buffer=replay_buffer,
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()
Example #3
    def __init__(
            self,
            train_dataset,
            test_dataset,
            model,
            batch_size=128,
            log_interval=0,
            lr=1e-3,
            **kwargs
    ):
        self.log_interval = log_interval
        self.batch_size = batch_size

        if ptu.gpu_enabled():
            model.cuda()

        self.model = model
        self.representation_size = model.representation_size

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.train_dataset, self.test_dataset = train_dataset, test_dataset
        assert self.train_dataset['z'].dtype == np.float32
        assert self.test_dataset['z'].dtype == np.float32
        assert self.train_dataset['z_proj'].dtype == np.float32
        assert self.test_dataset['z_proj'].dtype == np.float32
        self.mse = nn.MSELoss()
Example #4
    def update_networks_func(algo, epoch):
        if epoch % algo.epoch_freq != 0 and epoch != algo.num_epochs - 1:
            return  # no checkpoint is saved for this epoch, so skip the update
        if epoch == algo.num_epochs - 1:
            filename = local_path_from_s3_or_local_path(osp.join(variant['ckpt'], 'params.pkl'))
        else:
            filename = local_path_from_s3_or_local_path(osp.join(variant['ckpt'], 'itr_%d.pkl' % epoch))
        print("updating networks from {}".format(filename))
        data = joblib.load(filename)
        assert (data["epoch"] == epoch)
        algo.qf1 = data['qf1']
        algo.qf2 = data['qf2']
        algo.policy = data['trained_policy']
        algo.target_policy = data["target_policy"]
        algo.exploration_policy = data["exploration_policy"]

        if 'n_env_steps_total' in data:
            algo._n_env_steps_total = data["n_env_steps_total"]

        if isinstance(algo.eval_policy, SubgoalPlanner):
            algo.eval_policy.qf = algo.qf1
            algo.eval_policy.mf_policy = algo.policy
        else:
            algo.eval_policy = data["eval_policy"]

        if ptu.gpu_enabled():
            algo.cuda()
        if hasattr(algo, "update_sampler_and_rollout_function"):
            algo.update_sampler_and_rollout_function()
Example #5
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    hidden_size = variant['hidden_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_size, hidden_size],
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #6
def experiment(variant):
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data, info = generate_vae_dataset(
        **variant['get_data_kwargs'])
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    if 'beta_schedule_kwargs' in variant:
        beta_schedule = PiecewiseLinearSchedule(
            **variant['beta_schedule_kwargs'])
    else:
        beta_schedule = None
    m = ConvVAE(representation_size, input_channels=3)
    if ptu.gpu_enabled():
        m.to(ptu.device)
        gpu_id = variant.get("gpu_id", None)
        if gpu_id is not None:
            ptu.set_device(gpu_id)
    t = ConvVAETrainer(train_data,
                       test_data,
                       m,
                       beta=beta,
                       beta_schedule=beta_schedule,
                       **variant['algo_kwargs'])
    save_period = variant['save_period']
    for epoch in range(variant['num_epochs']):
        should_save_imgs = (epoch % save_period == 0)
        t.train_epoch(epoch)
        t.test_epoch(epoch,
                     save_reconstruction=should_save_imgs,
                     save_scatterplot=should_save_imgs)
        if should_save_imgs:
            t.dump_samples(epoch)
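
A minimal sketch of a variant for this VAE training loop; every top-level key below is read by the function above, but the values (and the contents of 'get_data_kwargs' and 'algo_kwargs') are illustrative assumptions.

# Hypothetical configuration; ConvVAETrainer kwargs such as batch_size and lr
# mirror the trainer __init__ shown in Example #20 and may differ in practice.
variant = dict(
    beta=5.0,
    representation_size=16,
    num_epochs=100,
    save_period=10,
    get_data_kwargs=dict(),
    algo_kwargs=dict(batch_size=128, lr=1e-3),
    # gpu_id=0,                                                   # optional: pick a specific GPU
    # beta_schedule_kwargs=dict(x_values=[0, 100], y_values=[0.1, 5.0]),  # optional schedule
)
experiment(variant)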
Example #7
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #8
def experiment(variant):
    num_rollouts = variant['num_rollouts']
    H = variant['H']
    render = variant['render']
    data = joblib.load(variant['qf_path'])
    qf = data['qf']
    env = data['env']
    qf_policy = data['policy']
    if ptu.gpu_enabled():
        qf.to(ptu.device)
        qf_policy.to(ptu.device)
    policy_class = variant['policy_class']
    if policy_class == StateOnlySdqBasedSqpOcPolicy:
        policy = policy_class(qf, env, qf_policy, **variant['policy_params'])
    else:
        policy = policy_class(qf, env, **variant['policy_params'])
    paths = []
    for _ in range(num_rollouts):
        goal = env.sample_goal_for_rollout()
        path = multitask_rollout(
            env,
            policy,
            goal,
            discount=variant['discount'],
            max_path_length=H,
            animated=render,
        )
        paths.append(path)
    env.log_diagnostics(paths)
    logger.dump_tabular(with_timestamp=False)
Example #9
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(env=env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #10
def experiment(variant):
    num_rollouts = variant['num_rollouts']
    H = variant['H']
    render = variant['render']
    data = joblib.load(variant['qf_path'])
    policy_params = variant['policy_params']
    if 'model' in data:
        model = data['model']
    else:
        qf = data['qf']
        model = ModelExtractor(qf)
        policy_params['model_learns_deltas'] = False
    env = data['env']
    if ptu.gpu_enabled():
        model.to(ptu.device)
    policy = variant['policy_class'](
        model,
        env,
        **policy_params
    )
    paths = []
    for _ in range(num_rollouts):
        goal = env.sample_goal_for_rollout()
        path = multitask_rollout(
            env,
            policy,
            goal,
            discount=0,
            max_path_length=H,
            animated=render,
        )
        paths.append(path)
    env.log_diagnostics(paths)
    logger.dump_tabular(with_timestamp=False)
Example #11
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskEnvToSilentMultitaskEnv(env)
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])

    observation_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    obs_normalizer = TorchFixedNormalizer(observation_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    delta_normalizer = TorchFixedNormalizer(observation_dim)
    model = DynamicsModel(observation_dim=observation_dim,
                          action_dim=action_dim,
                          obs_normalizer=obs_normalizer,
                          action_normalizer=action_normalizer,
                          delta_normalizer=delta_normalizer,
                          **variant['model_kwargs'])
    mpc_controller = MPCController(env, model, env.cost_fn,
                                   **variant['mpc_controller_kwargs'])
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=mpc_controller,
    )
    algo = DistanceModelTrainer(env,
                                model,
                                mpc_controller,
                                exploration_policy=exploration_policy,
                                obs_normalizer=obs_normalizer,
                                action_normalizer=action_normalizer,
                                delta_normalizer=delta_normalizer,
                                **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        algo.to(ptu.device)
    algo.train()
Example #12
def experiment(variant):
    # if variant['multitask']:
    #     env = MultitaskPoint2DEnv(**variant['env_kwargs'])
    #     env = MultitaskToFlatEnv(env)
    # else:
        # env = Pusher2DEnv(**variant['env_kwargs'])
    env_name = variant["env_name"]
    env = gym.make(env_name)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()
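
A hypothetical variant for this TD3 launcher. 'env_name', 'normalize', 'exploration_type', and 'algo_kwargs' are exactly the keys the function reads; the contents of algo_kwargs are assumptions about TD3's constructor, and the values are placeholders.

variant = dict(
    env_name='HalfCheetah-v2',     # any registered gym env id
    normalize=True,
    exploration_type='gaussian',   # or 'ou' / 'epsilon'
    algo_kwargs=dict(
        num_epochs=200,
        num_steps_per_epoch=1000,
        num_steps_per_eval=1000,
        max_path_length=1000,
        batch_size=128,
        discount=0.99,
    ),
)
experiment(variant)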
Example #13
def experiment(variant):
    env_class = variant['env_class']
    env = env_class(**variant['env_params'])
    env = NormalizedBoxEnv(
        env,
        **variant['normalize_params']
    )
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = variant['qf_class'](
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params']
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params']
    )
    epoch_discount_schedule = None
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    if epoch_discount_schedule_class is not None:
        epoch_discount_schedule = epoch_discount_schedule_class(
            **variant['epoch_discount_schedule_params']
        )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params']
    )
    if variant['explore_with_ddpg_policy']:
        raw_exploration_policy = policy
    else:
        raw_exploration_policy = TerminalRewardSampleOCPolicy(
            qf,
            env,
            5,
        )
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=raw_exploration_policy,
    )
    algo = variant['algo_class'](
        env,
        qf,
        policy,
        exploration_policy,
        epoch_discount_schedule=epoch_discount_schedule,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()
Example #14
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']

        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()
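
Every variant key this launcher uses appears explicitly in the code above, so a matching configuration is easy to reconstruct; the values below are placeholders, not recommended settings.

variant = dict(
    load_policy_file=None,   # or the path to a previously saved snapshot to resume from
    use_gpu=True,
    es_min_sigma=0.1,
    es_max_sigma=0.1,
    num_epochs=100,
    batch_size=128,
    dueling=False,
)
example(variant)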
Example #15
def resume_torch_algorithm_simple(variant):
    from railrl.torch import pytorch_util as ptu
    load_file = variant.get('params_file', None)
    if load_file is not None and osp.exists(load_file):
        data = joblib.load(load_file)
        algorithm = data['algorithm']
        epoch = data['epoch'] + 1
        if ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train(start_epoch=epoch)  # epoch already points at the next epoch to run
Example #16
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        train_policy_with_reparameterization=variant[
            'train_policy_with_reparameterization'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        vf_lr=variant['vf_lr'],
        reward_scale=variant['reward_scale'],
        use_automatic_entropy_tuning=variant.get(
            'use_automatic_entropy_tuning', False))

    M = variant['layer_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        # **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(env,
                                policy=policy,
                                qf=qf,
                                vf=vf,
                                **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        qf.cuda()
        vf.cuda()
        policy.cuda()
        algorithm.cuda()
    algorithm.train()
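
Because this launcher rebuilds variant['algo_kwargs'] from individual top-level keys, the variant has to supply every one of them. Below is a sketch with placeholder values; MultitaskPoint2DEnv is used only because it is constructed with no arguments elsewhere on this page (e.g. Example #2), and any gym-style env class would do.

variant = dict(
    env_class=MultitaskPoint2DEnv,
    layer_size=256,
    num_epochs=100,
    num_steps_per_epoch=1000,
    num_steps_per_eval=1000,
    max_path_length=1000,
    min_num_steps_before_training=1000,
    batch_size=128,
    discount=0.99,
    replay_buffer_size=int(1e6),
    soft_target_tau=1e-2,
    target_update_period=1,
    train_policy_with_reparameterization=True,
    policy_lr=3e-4,
    qf_lr=3e-4,
    vf_lr=3e-4,
    reward_scale=1,
    # use_automatic_entropy_tuning defaults to False via variant.get(...)
)
experiment(variant)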
Example #17
def experiment(variant):
    env = DiscreteReacherEnv(**variant['env_params'])
    qf = Mlp(
        hidden_sizes=[32, 32],
        input_size=int(np.prod(env.observation_space.shape)),
        output_size=env.action_space.n,
    )
    algorithm = DQN(env, qf=qf, **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #18
def her_twin_sac_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = HerTwinSac(
        env,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        policy=policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()
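
A hypothetical variant for the HER + Twin SAC launcher above. GoalReachingEnv is a made-up stand-in for any goal-conditioned env whose observation space is a dict with 'observation' and 'desired_goal' entries, which the code above requires; the contents of the *_kwargs dicts are likewise assumptions.

variant = dict(
    env_class=GoalReachingEnv,     # hypothetical goal-conditioned, dict-observation env
    env_kwargs=dict(),
    observation_key='observation',
    desired_goal_key='desired_goal',
    normalize=False,
    replay_buffer_kwargs=dict(max_size=int(1e6)),   # assumed ObsDictRelabelingBuffer kwarg
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    vf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    algo_kwargs=dict(
        num_epochs=100,
        max_path_length=100,
        batch_size=128,
        discount=0.99,
    ),
)
her_twin_sac_experiment(variant)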
Example #19
def experiment(variant):
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    # train_data, test_data, info = generate_vae_dataset(
    #     **variant['get_data_kwargs']
    # )
    num_divisions = 5
    images = np.zeros((num_divisions * 10000, 21168))
    for i in range(num_divisions):
        imgs = np.load(
            '/home/murtaza/vae_data/sawyer_torque_control_images100000_' +
            str(i + 1) + '.npy')
        images[i * 10000:(i + 1) * 10000] = imgs
        print(i)
    mid = int(num_divisions * 10000 * .9)
    train_data, test_data = images[:mid], images[mid:]
    info = dict()

    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    if 'beta_schedule_kwargs' in variant:
        kwargs = variant['beta_schedule_kwargs']
        kwargs['y_values'][2] = variant['beta']
        kwargs['x_values'][1] = variant['flat_x']
        kwargs['x_values'][2] = variant['ramp_x'] + variant['flat_x']
        beta_schedule = PiecewiseLinearSchedule(
            **variant['beta_schedule_kwargs'])
    else:
        beta_schedule = None
    m = ConvVAE(representation_size,
                input_channels=3,
                **variant['conv_vae_kwargs'])
    if ptu.gpu_enabled():
        m.cuda()
    t = ConvVAETrainer(train_data,
                       test_data,
                       m,
                       beta=beta,
                       beta_schedule=beta_schedule,
                       **variant['algo_kwargs'])
    save_period = variant['save_period']
    for epoch in range(variant['num_epochs']):
        should_save_imgs = (epoch % save_period == 0)
        t.train_epoch(epoch)
        t.test_epoch(epoch,
                     save_reconstruction=should_save_imgs,
                     save_scatterplot=should_save_imgs)
        if should_save_imgs:
            t.dump_samples(epoch)
Example #20
    def __init__(
            self,
            train_dataset,
            test_dataset,
            model,
            batch_size=128,
            beta=0.5,
            beta_schedule=None,
            lr=1e-3,
            extra_recon_logging=dict(),
            recon_weights=None,
            recon_loss_type='mse',
            **kwargs
    ):
        assert recon_loss_type in ['mse', 'wse']
        self.batch_size = batch_size
        self.beta = beta
        self.beta_schedule = beta_schedule
        if self.beta_schedule is None:
            self.beta_schedule = ConstantSchedule(self.beta)

        if ptu.gpu_enabled():
            model.cuda()

        self.model = model
        self.representation_size = model.representation_size

        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.train_dataset, self.test_dataset = train_dataset, test_dataset
        assert self.train_dataset['next_obs'].dtype == np.float32
        assert self.test_dataset['next_obs'].dtype == np.float32
        assert self.train_dataset['obs'].dtype == np.float32
        assert self.test_dataset['obs'].dtype == np.float32
        self.normalize = model.normalize
        self.mse = nn.MSELoss()

        if self.normalize:
            self.train_data_mean = ptu.np_to_var(np.mean(self.train_dataset['next_obs'], axis=0))
            np_std = np.std(self.train_dataset['next_obs'], axis=0)
            for i in range(len(np_std)):
                if np_std[i] < 1e-3:
                    np_std[i] = 1.0
            self.train_data_std = ptu.np_to_var(np_std)

            self.model.train_data_mean = self.train_data_mean
            self.model.train_data_std = self.train_data_std

        self.extra_recon_logging = extra_recon_logging
        self.recon_weights = recon_weights
        self.recon_loss_type = recon_loss_type
Example #21
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[100, 100],
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    #     **variant['es_kwargs']
    # )
    # es = EpsilonGreedy(
    #     action_space=env.action_space,
    #     prob_random_action=0.2,
    # )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #22
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskSawyerXYZReachingEnv(env_params)
    tdm_normalizer = TdmNormalizer(
        env,
        vectorized=True,
        max_tau=variant['ddpg_tdm_kwargs']['tdm_kwargs']['max_tau'],
    )
    qf = TdmQf(
        env=env,
        vectorized=True,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        structure='norm_difference',
        tdm_normalizer=tdm_normalizer,
    )
    policy = TdmPolicy(
        env=env,
        hidden_sizes=[variant['hidden_sizes'], variant['hidden_sizes']],
        tdm_normalizer=tdm_normalizer,
    )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = copy.deepcopy(variant['ddpg_tdm_kwargs'])
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **ddpg_tdm_kwargs  # use the copy that carries qf_criterion
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #23
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(action_space=action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    remote_env = RemoteRolloutEnv(
        env,
        policy,
        exploration_policy,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    qf = FeedForwardQFunction(
        int(remote_env.observation_space.flat_dim),
        int(remote_env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = ParallelDDPG(
        remote_env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #24
def experiment(variant):
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    # this dataset has both states and images, so generate_vae_dataset can't be used here
    X = np.load(
        '/home/murtaza/vae_data/sawyer_torque_control_ou_imgs_zoomed_out10000.npy'
    )
    Y = np.load(
        '/home/murtaza/vae_data/sawyer_torque_control_ou_states_zoomed_out10000.npy'
    )
    Y = np.concatenate((Y[:, :7], Y[:, 14:]), axis=1)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.1)
    info = dict()
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    if 'beta_schedule_kwargs' in variant:
        beta_schedule = PiecewiseLinearSchedule(
            **variant['beta_schedule_kwargs'])
    else:
        beta_schedule = None
    m = ConvVAE(representation_size,
                input_channels=3,
                state_sim_debug=True,
                state_size=Y.shape[1],
                **variant['conv_vae_kwargs'])
    if ptu.gpu_enabled():
        m.cuda()
    t = ConvVAETrainer((X_train, Y_train), (X_test, Y_test),
                       m,
                       beta=beta,
                       beta_schedule=beta_schedule,
                       state_sim_debug=True,
                       **variant['algo_kwargs'])
    save_period = variant['save_period']
    for epoch in range(variant['num_epochs']):
        should_save_imgs = (epoch % save_period == 0)
        t.train_epoch(epoch)
        t.test_epoch(epoch,
                     save_reconstruction=should_save_imgs,
                     save_scatterplot=should_save_imgs)
        if should_save_imgs:
            t.dump_samples(epoch)
Example #25
def experiment(variant):
    env = NormalizedBoxEnv(MultitaskPoint2DEnv())  # try full-state reacher
    # env = Reacher7DofMultitaskEnv()
    es = OUStrategy(action_space=env.action_space)
    policy = TdmPolicy(env=env, **variant['policy_kwargs'])
    replay_buffer_size = variant['algo_params']['base_kwargs'][
        'replay_buffer_size']
    replay_buffer = TauReplayBuffer(replay_buffer_size, env)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TdmSupervised(env,
                              replay_buffer=replay_buffer,
                              policy=policy,
                              exploration_policy=exploration_policy,
                              **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example #26
    def __init__(
            self,
            X_train,
            X_test,
            y_train,
            y_test,
            model,
            batch_size=128,
            lr=3e-4,
            weight_decay=0,
            num_batches=128,
    ):
        self.batch_size = batch_size
        if ptu.gpu_enabled():
            model.to(ptu.device)
        self.model = model
        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(model.parameters(),
                                    lr=lr,
                                    weight_decay=weight_decay)
        self.X_train, self.X_test, self.y_train, self.y_test = X_train, X_test, y_train, y_test
        self.num_batches = num_batches
Example #27
def experiment(variant):
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data, info = get_data(**variant['get_data_kwargs'])
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    beta_schedule = PiecewiseLinearSchedule(**variant['beta_schedule_kwargs'])
    m = ConvVAE(representation_size, input_channels=3)
    if ptu.gpu_enabled():
        m.to(ptu.device)
    t = ConvVAETrainer(train_data,
                       test_data,
                       m,
                       beta=beta,
                       beta_schedule=beta_schedule,
                       **variant['algo_kwargs'])
    for epoch in range(variant['num_epochs']):
        t.train_epoch(epoch)
        t.test_epoch(epoch)
        t.dump_samples(epoch)
Example #28
def experiment(variant):
    from railrl.core import logger
    import railrl.torch.pytorch_util as ptu
    beta = variant["beta"]
    representation_size = variant["representation_size"]
    train_data, test_data, info = generate_vae_dataset(
        **variant['generate_vae_dataset_kwargs'])
    logger.save_extra_data(info)
    logger.get_snapshot_dir()
    if 'beta_schedule_kwargs' in variant:
        # kwargs = variant['beta_schedule_kwargs']
        # kwargs['y_values'][2] = variant['beta']
        # kwargs['x_values'][1] = variant['flat_x']
        # kwargs['x_values'][2] = variant['ramp_x'] + variant['flat_x']
        variant['beta_schedule_kwargs']['y_values'][-1] = variant['beta']
        beta_schedule = PiecewiseLinearSchedule(
            **variant['beta_schedule_kwargs'])
    else:
        beta_schedule = None
    m = ConvVAE(representation_size,
                input_channels=3,
                **variant['conv_vae_kwargs'])
    if ptu.gpu_enabled():
        m.cuda()
    t = ConvVAETrainer(train_data,
                       test_data,
                       m,
                       beta=beta,
                       beta_schedule=beta_schedule,
                       **variant['algo_kwargs'])
    save_period = variant['save_period']
    for epoch in range(variant['num_epochs']):
        should_save_imgs = (epoch % save_period == 0)
        t.train_epoch(epoch)
        t.test_epoch(epoch,
                     save_reconstruction=should_save_imgs,
                     save_scatterplot=should_save_imgs)
        if should_save_imgs:
            t.dump_samples(epoch)
Example #29
def experiment(variant):
    env_params = variant['env_params']
    env = MultiTaskBaxterEnv(**env_params)

    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = FlatUniversalQfunction(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params'],
    )

    policy = FFUniversalPolicy(int(observation_space.flat_dim),
                               int(action_space.flat_dim), env.goal_dim,
                               **variant['policy_params'])

    es = variant['sampler_es_class'](action_space=action_space,
                                     **variant['sampler_es_params'])

    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    epoch_discount_schedule = variant['epoch_discount_schedule_class'](
        **variant['epoch_discount_schedule_params'])

    algo = HorizonFedStateDistanceQLearning(
        env,
        qf,
        policy,
        exploration_policy,
        qf_criterion=HuberLoss(),
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()
Example #30
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = OUStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    env.set_goal(variant['goal'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()