def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
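# Illustrative usage sketch for experiment() above. Each key mirrors a lookup
# made inside the function, but the concrete classes and values are
# hypothetical placeholders (e.g. ConstantSchedule and the algo_params keys are
# assumptions, not settings from this script):
#
# variant = dict(
#     env_class=HalfCheetahEnv,
#     epoch_discount_schedule_class=ConstantSchedule,   # hypothetical schedule
#     epoch_discount_schedule_params=dict(value=0.99),
#     algo_params=dict(
#         num_epochs=100,    # hypothetical DDPG settings
#         batch_size=128,
#     ),
# )
# experiment(variant)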
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = MultiStepDdpg(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.train()
    return algorithm.final_score
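# Illustrative usage sketch for the MultiStepDdpg experiment above. Only
# 'max_path_length' is read explicitly (to build TimeLimitedEnv); the remaining
# algo_params keys are hypothetical placeholders:
#
# variant = dict(
#     algo_params=dict(
#         max_path_length=200,   # horizon passed to TimeLimitedEnv
#         num_epochs=50,         # hypothetical
#         batch_size=128,        # hypothetical
#     ),
# )
# final_score = experiment(variant)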
def example(variant):
    env = CartpoleEnv()
    env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        **variant['qf_params'],
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
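# Illustrative usage sketch for example() above. 'qf_params' is unpacked into
# FeedForwardQFunction after the observation/action dims, so its keys are
# assumed to name the hidden-layer sizes; both the kwarg names and all values
# are hypothetical placeholders:
#
# variant = dict(
#     qf_params=dict(
#         observation_hidden_size=400,   # hypothetical kwarg name
#         embedded_hidden_size=300,      # hypothetical kwarg name
#     ),
#     algo_params=dict(num_epochs=100),  # hypothetical
# )
# example(variant)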
def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
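# Illustrative usage sketch for example() above. 'normalize' toggles the
# normalize() wrapper; the algo_params keys and values are hypothetical
# placeholders:
#
# variant = dict(
#     normalize=True,
#     algo_params=dict(
#         num_epochs=100,   # hypothetical
#         batch_size=128,   # hypothetical
#     ),
# )
# example(variant)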
def experiment(variant):
    env = variant['env_class'](**variant['env_params'])
    env = normalize(env)
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_params']
    )
    algo_class = variant['algo_class']
    algo_params = variant['algo_params']
    hidden_size = variant['hidden_size']
    if algo_class == DDPG:
        # algo_params.pop('naf_policy_learning_rate')
        qf = FeedForwardQFunction(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
            hidden_size,
        )
        algorithm = DDPG(
            env,
            exploration_strategy=es,
            qf=qf,
            policy=policy,
            **variant['algo_params']
        )
    elif algo_class == NAF:
        algo_params.pop('qf_learning_rate')
        # algo_params.pop('policy_learning_rate')
        qf = NafPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            hidden_size,
        )
        algorithm = NAF(
            env,
            policy=qf,
            exploration_strategy=es,
            **variant['algo_params']
        )
    else:
        raise Exception("Invalid algo class: {}".format(algo_class))
    algorithm.to(ptu.device)
    algorithm.train()
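# Illustrative usage sketch for the DDPG/NAF experiment above. The keys mirror
# the lookups in the function; all values are hypothetical placeholders. Note
# that 'qf_learning_rate' has to be present in algo_params so the NAF branch
# can pop it:
#
# variant = dict(
#     env_class=HalfCheetahEnv,
#     env_params=dict(),
#     es_params=dict(theta=0.15),    # hypothetical OUStrategy kwargs
#     algo_class=NAF,                # or DDPG
#     hidden_size=100,
#     algo_params=dict(
#         qf_learning_rate=1e-3,     # popped when algo_class is NAF
#         num_epochs=100,            # hypothetical
#     ),
# )
# experiment(variant)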
def experiment(variant):
    env = NormalizedBoxEnv(MultiGoalEnv(
        actuation_cost_coeff=10,
        distance_cost_coeff=1,
        goal_reward=10,
    ))
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
        100,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    plotter = QFPolicyPlotter(
        qf=qf,
        # policy=policy,
        policy=exploration_policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        render_eval_paths=True,
        plotter=plotter,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()