Example #1
def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    # Ornstein-Uhlenbeck exploration noise over the environment's action space.
    es = OUStrategy(action_space=env.action_space)
    # Q-function and policy networks with hidden layer sizes of 400 and 300.
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(env,
                     exploration_strategy=es,
                     qf=qf,
                     policy=policy,
                     epoch_discount_schedule=epoch_discount_schedule,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
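For reference, a hypothetical variant dict that the snippet above could be launched with; the environment class, schedule class, and hyperparameter values are illustrative assumptions rather than values from the original code.

variant = dict(
    env_class=HalfCheetahEnv,                        # any gym-style env class
    epoch_discount_schedule_class=ConstantSchedule,  # assumed schedule class
    epoch_discount_schedule_params=dict(value=0.99),
    algo_params=dict(
        num_epochs=100,
        batch_size=128,
    ),
)
experiment(variant)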
Example #2
def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #3
def example(variant):
    env = HalfCheetahEnv()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example #4
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        # Resume training for only the epochs remaining after the checkpoint.
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']

        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()
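The resume branch in this example expects load_policy_file to point at a joblib snapshot containing the trained algorithm and the epoch it stopped at. A minimal sketch of writing a compatible snapshot, with an illustrative file path:

import joblib

def save_snapshot(algorithm, epoch, path='ddpg_snapshot.pkl'):
    # The keys must match what the resume branch reads back out.
    joblib.dump({'algorithm': algorithm, 'epoch': epoch}, path)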
def experiment(variant):
    from railrl.torch.ddpg import DDPG
    from railrl.launchers.launcher_util import set_seed
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_params = variant['env_params']
    es_class = variant['es_class']
    es_params = variant['es_params']

    set_seed(seed)
    env = TwoDPoint(**env_params)
    es = es_class(env_spec=env.spec, **es_params)
    algorithm = DDPG(env, es, **algo_params)
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_params'])
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(env,
                         exploration_strategy=es,
                         qf=qf,
                         policy=policy,
                         **variant['algo_params'])
    elif algo_class == NAF:
        # NAF uses a single network that serves as both Q-function and policy,
        # so the separate Q-function learning rate is dropped.
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(env,
                        policy=qf,
                        exploration_strategy=es,
                        **variant['algo_params'])
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(
        action_space=action_space,
        **variant['es_params']
    )
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = DDPG(env,
                     exploration_strategy=es,
                     qf=qf,
                     policy=policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()