def experiment(variant):
    # Build and normalize the environment named in the variant.
    env = variant['env_class']()
    env = normalize(env)
    # Ornstein-Uhlenbeck exploration noise.
    es = OUStrategy(action_space=env.action_space)
    # Q-function and policy networks with 400- and 300-unit hidden layers.
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    # The discount factor is annealed across epochs by this schedule.
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

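# --- Hedged usage sketch (not part of the original script) ---
# The launcher above only defines what a single run does; something still has
# to hand it a variant dict. The dict below is an illustrative guess at the
# keys example() reads ('env_class', 'qf_params', 'algo_params'); the concrete
# values are placeholders, not documented defaults. HalfCheetahEnv is assumed
# to be imported at module level, as it is in the neighbouring scripts.
if __name__ == "__main__":
    variant = dict(
        env_class=HalfCheetahEnv,
        # Forwarded as keyword arguments to FeedForwardQFunction; the exact
        # keys depend on that class's signature.
        qf_params=dict(),
        # Forwarded as keyword arguments to DDPG; the hyperparameter names
        # here are typical but unverified assumptions.
        algo_params=dict(
            num_epochs=100,
            batch_size=128,
        ),
    )
    example(variant)
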
def example(variant):
    env = HalfCheetahEnv()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        # Resume a previously saved run: restore the algorithm from the
        # snapshot and train only for the remaining epochs.
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        # Otherwise build everything from scratch.
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']
        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()

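# --- Hedged usage sketch (not part of the original script) ---
# Illustration of the resume branch above: 'load_policy_file' is expected to
# point at a joblib snapshot containing 'algorithm' and 'epoch' entries, as
# written by an earlier run. The path below is a placeholder, and the
# fresh-run keys from the else-branch ('es_min_sigma', 'dueling', ...) are
# omitted because they are not read when a snapshot is found.
resume_variant = dict(
    load_policy_file='/path/to/previous_run/params.pkl',
    use_gpu=True,
)
example(resume_variant)
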
def experiment(variant):
    from railrl.torch.ddpg import DDPG
    from railrl.launchers.launcher_util import (
        set_seed,
    )
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_params = variant['env_params']
    es_class = variant['es_class']
    es_params = variant['es_params']
    set_seed(seed)
    env = TwoDPoint(**env_params)
    es = es_class(env_spec=env.spec, **es_params)
    algorithm = DDPG(env, es, **algo_params)
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_params'])
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(
            env,
            exploration_strategy=es,
            qf=qf,
            policy=policy,
            **variant['algo_params']
        )
    elif algo_class == NAF:
        # NAF folds the Q-function and policy into a single network, so the
        # separate Q-function learning rate is dropped before forwarding.
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(
            env,
            policy=qf,
            exploration_strategy=es,
            **variant['algo_params']
        )
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()

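# --- Hedged usage sketch (not part of the original script) ---
# A plausible variant for the DDPG branch of the launcher above. The top-level
# key names mirror what experiment() reads; the nested values are illustrative
# guesses rather than documented defaults. Switching algo_class to NAF selects
# the other branch, which pops 'qf_learning_rate' from algo_params, so that
# key should be present in either case.
ddpg_variant = dict(
    env_class=HalfCheetahEnv,
    env_params=dict(),
    es_params=dict(max_sigma=1.0, min_sigma=0.1),
    algo_class=DDPG,
    hidden_size=300,
    algo_params=dict(
        num_epochs=50,
        qf_learning_rate=1e-3,
    ),
)
experiment(ddpg_variant)
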
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(
        action_space=action_space,
        **variant['es_params']
    )
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()