Example #1
def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
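
As a usage sketch (not part of the original source), a minimal variant dictionary covering the two keys this launcher reads, 'normalize' and 'algo_params', could look as follows; the hyperparameter values are illustrative assumptions, and the railrl-style classes used above are assumed to be importable.

# Hypothetical launcher call; the keys mirror what example() reads above,
# the values are placeholder assumptions.
variant = dict(
    normalize=True,
    algo_params=dict(
        num_epochs=100,   # assumed; passed through to DDPG
        batch_size=128,   # assumed; passed through to DDPG
    ),
)
example(variant)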
Example #2
def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(env,
                     exploration_strategy=es,
                     qf=qf,
                     policy=policy,
                     epoch_discount_schedule=epoch_discount_schedule,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
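
A matching variant for this launcher might look like the sketch below. The ConstantSchedule class is a hypothetical local stand-in (only its constructor is exercised by the code above; how DDPG consumes the schedule is internal to that codebase), and the environment class is borrowed from Example #1.

# Hypothetical stand-in schedule; the real schedule classes come from the same
# codebase as DDPG and may expose a richer interface than this.
class ConstantSchedule(object):
    def __init__(self, value):
        self.value = value

variant = dict(
    env_class=HalfCheetahEnv,                          # as in Example #1
    epoch_discount_schedule_class=ConstantSchedule,
    epoch_discount_schedule_params=dict(value=0.99),   # assumed discount value
    algo_params=dict(),                                # DDPG kwargs as needed
)
experiment(variant)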
Example #3
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf_hidden_sizes = variant['qf_hidden_sizes']
    qf = EasyVQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = EasyVQLearning(env,
                               exploration_strategy=es,
                               qf=qf,
                               policy=policy,
                               **variant['algo_params'])
    algorithm.train()
    return algorithm.final_score
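
For this Pendulum launcher, a plausible variant only needs 'qf_hidden_sizes' and a 'max_path_length' entry inside 'algo_params'; the concrete numbers below are assumptions.

variant = dict(
    algo_params=dict(
        max_path_length=200,   # horizon handed to TimeLimitedEnv; assumed value
    ),
    qf_hidden_sizes=100,       # reused for every EasyVQFunction layer above; assumed
)
final_score = experiment(variant)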
Example #4
def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
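
A sketch of the corresponding variant is shown below; the keyword names accepted inside 'qf_params' depend on the FeedForwardQFunction signature, so that dict is left empty here, and 'algo_params' is likewise a placeholder.

variant = dict(
    env_class=HalfCheetahEnv,   # as in Example #1
    qf_params=dict(),           # hidden-size kwargs for FeedForwardQFunction
    algo_params=dict(),         # DDPG kwargs as needed
)
example(variant)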
Example #5
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']

        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()
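
The variant for this launcher controls both branches: if 'load_policy_file' points at an existing joblib snapshot, training resumes from it; otherwise a fresh Reacher DDPG run is configured. The numeric values below are illustrative assumptions.

variant = dict(
    load_policy_file=None,   # or a path to a joblib snapshot containing 'algorithm' and 'epoch'
    es_min_sigma=0.1,        # assumed OU noise bounds
    es_max_sigma=0.1,
    num_epochs=100,          # assumed
    batch_size=128,          # assumed
    use_gpu=True,
    dueling=False,           # True selects FeedForwardDuelingQFunction
)
example(variant)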
Example #6
def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_params'])
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(env,
                         exploration_strategy=es,
                         qf=qf,
                         policy=policy,
                         **variant['algo_params'])
    elif algo_class == NAF:
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(env,
                        policy=qf,
                        exploration_strategy=es,
                        **variant['algo_params'])
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()
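
This launcher dispatches on 'algo_class', so a variant might look like the sketch below; switching algo_class to NAF exercises the NafPolicy branch, which pops 'qf_learning_rate' from 'algo_params' before constructing the algorithm. The numeric values are assumptions.

variant = dict(
    env_class=HalfCheetahEnv,                       # as in Example #1
    env_params=dict(),
    es_params=dict(min_sigma=0.1, max_sigma=0.1),   # OUStrategy kwargs; assumed values
    algo_class=DDPG,                                # or NAF
    algo_params=dict(qf_learning_rate=1e-3),        # popped in the NAF branch; assumed value
    hidden_size=300,
)
experiment(variant)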
Example #7
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    zf = FeedForwardZFunction(int(env.observation_space.flat_dim),
                              int(env.action_space.flat_dim), 400, 300,
                              **variant['zf_params'])
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = DistributionalDDPG(env,
                                   zf=zf,
                                   policy=policy,
                                   exploration_strategy=es,
                                   **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
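
A corresponding variant only has to provide the environment class plus kwargs for the Z-function and the algorithm; the exact keys accepted by 'zf_params' depend on the FeedForwardZFunction signature, so both dicts are left as placeholders.

variant = dict(
    env_class=HalfCheetahEnv,   # as in Example #1
    zf_params=dict(),           # extra FeedForwardZFunction kwargs (return-distribution settings, etc.)
    algo_params=dict(),         # DistributionalDDPG kwargs as needed
)
example(variant)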
Example #8
def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))

    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100)
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     render_eval_paths=True,
                     plotter=plotter,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #9
def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    ddpg1_snapshot_dict = joblib.load(variant['ddpg1_snapshot_path'])
    ddpg2_snapshot_dict = joblib.load(variant['ddpg2_snapshot_path'])
    replay_buffer1 = joblib.load(
        variant['replay_buffer1_path'])['replay_buffer']
    replay_buffer2 = joblib.load(
        variant['replay_buffer2_path'])['replay_buffer']
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = DdpgQfCombiner(env=env,
                               qf1=ddpg1_snapshot_dict['qf'],
                               qf2=ddpg2_snapshot_dict['qf'],
                               policy=policy,
                               replay_buffer1=replay_buffer1,
                               replay_buffer2=replay_buffer2,
                               **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
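
The combiner expects snapshots and replay buffers saved by two earlier DDPG runs; the paths below are placeholders, and the files are assumed to be joblib pickles where the snapshots contain a 'qf' entry and the buffer files a 'replay_buffer' entry, as the code above requires.

variant = dict(
    env_class=HalfCheetahEnv,   # as in Example #1
    env_params=dict(),
    ddpg1_snapshot_path='/path/to/ddpg1/params.pkl',       # placeholder path
    ddpg2_snapshot_path='/path/to/ddpg2/params.pkl',       # placeholder path
    replay_buffer1_path='/path/to/ddpg1/extra_data.pkl',   # placeholder path
    replay_buffer2_path='/path/to/ddpg2/extra_data.pkl',   # placeholder path
    algo_params=dict(),
)
experiment(variant)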