def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
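# A minimal launcher sketch for the entry point above. 'normalize' and
# 'algo_params' are the keys the function reads; the specific hyperparameters
# inside algo_params are illustrative assumptions, not a confirmed config.
variant = dict(
    normalize=True,
    algo_params=dict(
        num_epochs=100,
        num_steps_per_epoch=1000,
        batch_size=128,
    ),
)
example(variant)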
def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
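# A hypothetical variant for the schedule-based experiment above. ConstantSchedule
# is a placeholder for whatever epoch-discount schedule class the project provides,
# and its constructor arguments are assumed; the keys mirror what the function reads.
variant = dict(
    env_class=HalfCheetahEnv,
    epoch_discount_schedule_class=ConstantSchedule,   # placeholder class name
    epoch_discount_schedule_params=dict(value=0.99),  # assumed signature
    algo_params=dict(num_epochs=100, batch_size=128),
)
experiment(variant)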
def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = MultiStepDdpg(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.train()
    return algorithm.final_score
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        # Resume training from a saved snapshot: run only the remaining epochs.
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        # Otherwise build a fresh DDPG setup on the Reacher task.
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']
        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()
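# A hypothetical variant for the resume-or-train-fresh entry point above. Every
# key is one the function reads; the values are illustrative only. With
# load_policy_file=None the fresh-training branch runs.
variant = dict(
    load_policy_file=None,  # or a path to a joblib snapshot to resume from
    es_min_sigma=0.1,
    es_max_sigma=1.0,
    num_epochs=100,
    batch_size=128,
    use_gpu=True,
    dueling=False,
)
example(variant)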
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(
        action_space=action_space,
    )
    policy_class = variant['policy_class']
    use_gpu = variant['use_gpu']
    if variant['normalize_env']:
        env = normalize(env)
    es = es_class(**es_params)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    # The remote env re-creates the env, policy, and exploration strategy in
    # worker processes from these classes and parameter dicts.
    remote_env = RemoteRolloutEnv(
        env_class,
        env_params,
        policy_class,
        policy_params,
        es_class,
        es_params,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    algorithm = ParallelDDPG(
        remote_env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params'],
    )
    if use_gpu:
        algorithm.cuda()
    algorithm.train()
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    obs_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    es_class = variant['es_class']
    es_params = dict(action_space=action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=int(obs_space.flat_dim),
        action_dim=int(action_space.flat_dim),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    remote_env = RemoteRolloutEnv(
        env,
        policy,
        exploration_policy,
        variant['max_path_length'],
        variant['normalize_env'],
    )
    qf = FeedForwardQFunction(
        int(remote_env.observation_space.flat_dim),
        int(remote_env.action_space.flat_dim),
        100,
        100,
    )
    algorithm = ParallelDDPG(
        remote_env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
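# A hypothetical variant for the parallel setup above. The classes are ones
# already used in this file; the numeric values and exploration-strategy
# parameters are illustrative assumptions.
variant = dict(
    env_class=HalfCheetahEnv,
    env_params=dict(),
    es_class=OUStrategy,
    es_params=dict(max_sigma=1.0, min_sigma=0.1),
    policy_class=FeedForwardPolicy,
    max_path_length=1000,
    normalize_env=False,
    use_gpu=True,
    algo_params=dict(num_epochs=100, batch_size=128),
)
example(variant)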
def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_params'])
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(
            env,
            exploration_strategy=es,
            qf=qf,
            policy=policy,
            **variant['algo_params']
        )
    elif algo_class == NAF:
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(
            env,
            policy=qf,
            exploration_strategy=es,
            **variant['algo_params']
        )
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()
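# A hypothetical variant for the DDPG-vs-NAF comparison above. algo_class
# selects the branch; qf_learning_rate is included because the NAF branch pops
# it before unpacking algo_params. All values are illustrative assumptions.
variant = dict(
    env_class=HalfCheetahEnv,
    env_params=dict(),
    es_params=dict(max_sigma=1.0, min_sigma=0.1),
    algo_class=NAF,
    hidden_size=100,
    algo_params=dict(
        num_epochs=100,
        batch_size=128,
        qf_learning_rate=1e-3,
    ),
)
experiment(variant)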
def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        ))
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        render_eval_paths=True,
        plotter=plotter,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env_class = variant['env_class']
    env_params = variant['env_params']
    env = env_class(**env_params)
    # Assign the wrapped env; normalize returns a new env rather than
    # modifying in place.
    env = normalize(env)
    es_class = variant['es_class']
    es_params = dict(action_space=env.action_space, **variant['es_params'])
    use_gpu = variant['use_gpu']
    es = es_class(**es_params)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy_class = variant['policy_class']
    policy_params = dict(
        obs_dim=get_dim(env.observation_space),
        action_dim=get_dim(env.action_space),
        fc1_size=100,
        fc2_size=100,
    )
    policy = policy_class(**policy_params)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params'],
    )
    if use_gpu and ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()