def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    gcm = FlattenMlp(
        input_size=env.goal_dim + obs_dim + action_dim + 1,
        output_size=env.goal_dim,
        **variant['gcm_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_kwargs'])
    gcm_criterion = variant['gcm_criterion_class'](**variant['gcm_criterion_kwargs'])
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['base_kwargs']['replay_buffer'] = replay_buffer
    algorithm = GcmDdpg(
        env,
        gcm=gcm,
        policy=policy,
        exploration_policy=exploration_policy,
        gcm_criterion=gcm_criterion,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = CylinderXYPusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    replay_buffer = SimpleHerReplayBuffer(env=env, **variant['replay_buffer_kwargs'])
    algorithm = HerSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(env)
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[128, 128],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[128, 128],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

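# Hypothetical driver for the DDPG launcher above: a minimal sketch, not a
# setting from this repo. Only 'env_id' and 'algo_params' are read by
# experiment(); the env id below is illustrative, and an empty 'algo_params'
# assumes the DDPG constructor provides defaults.
if __name__ == "__main__":
    variant = dict(
        env_id='HalfCheetah-v2',  # any continuous-control gym env id
        algo_params=dict(),       # DDPG hyperparameters (epochs, batch size, ...)
    )
    experiment(variant)
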
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['sac_tdm_kwargs']['tdm_kwargs']['vectorized']
    # The networks condition on (obs, goal, tau): the "+ 1" input is the TDM
    # planning horizon, and 'vectorized' switches the Q/V outputs from a
    # scalar to one value per goal dimension.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim + env.goal_dim + 1,
        output_size=env.goal_dim if vectorized else 1,
        **variant['qf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=env.goal_dim if vectorized else 1,
        **variant['vf_params']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + env.goal_dim + 1,
        action_dim=action_dim,
        **variant['policy_params']
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_params'])
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['sac_tdm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

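# Sketch of the nested variant structure the TDM-SAC launcher above expects.
# The keys mirror exactly what experiment() dereferences; the values are
# illustrative placeholders (MultitaskPoint2DEnv is borrowed from another
# launcher in this collection), not tuned settings.
example_variant = dict(
    env_class=MultitaskPoint2DEnv,  # any multitask env exposing goal_dim
    qf_params=dict(),
    vf_params=dict(),
    policy_params=dict(),
    her_replay_buffer_params=dict(),
    sac_tdm_kwargs=dict(
        tdm_kwargs=dict(vectorized=True),
        # remaining TdmSac kwargs (e.g. sac_kwargs, base_kwargs) go here
    ),
)
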
def experiment(variant):
    # env = NormalizedBoxEnv(Reacher7DofXyzGoalState())
    env = NormalizedBoxEnv(MultitaskPoint2DEnv())
    vectorized = True
    policy = StochasticTdmPolicy(env=env, **variant['policy_kwargs'])
    qf = TdmQf(env=env, vectorized=vectorized, norm_order=2, **variant['qf_kwargs'])
    vf = TdmVf(env=env, vectorized=vectorized, **variant['vf_kwargs'])
    replay_buffer_size = variant['algo_params']['base_kwargs']['replay_buffer_size']
    replay_buffer = HerReplayBuffer(replay_buffer_size, env)
    algorithm = TdmSac(
        env,
        qf,
        vf,
        variant['algo_params']['sac_kwargs'],
        variant['algo_params']['tdm_kwargs'],
        variant['algo_params']['base_kwargs'],
        supervised_weight=variant['algo_params']['supervised_weight'],
        policy=policy,
        replay_buffer=replay_buffer,
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    qf_class = variant['qf_class']
    vf_class = variant['vf_class']
    policy_class = variant['policy_class']
    qf = qf_class(
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=env.goal_dim,
        output_size=env.goal_dim if vectorized else 1,
        **variant['qf_params']
    )
    # Note: the value function shares the Q-function's hyperparameters
    # ('qf_params'); the variant carries no separate vf_params entry.
    vf = vf_class(
        observation_dim=obs_dim,
        goal_dim=env.goal_dim,
        output_size=env.goal_dim if vectorized else 1,
        **variant['qf_params']
    )
    policy = policy_class(
        obs_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=env.goal_dim,
        **variant['policy_params']
    )
    replay_buffer = HerReplayBuffer(env=env, **variant['her_replay_buffer_params'])
    algorithm = TdmSac(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(
        MultiGoalEnv(
            actuation_cost_coeff=10,
            distance_cost_coeff=1,
            goal_reward=10,
        )
    )
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[100, 100],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[100, 100],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    with torch.autograd.profiler.profile() as prof:
        algorithm.train()
    prof.export_chrome_trace("tmp-torch-chrome-trace.prof")

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskEnvToSilentMultitaskEnv(env)
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    observation_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    obs_normalizer = TorchFixedNormalizer(observation_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    delta_normalizer = TorchFixedNormalizer(observation_dim)
    model = DynamicsModel(
        observation_dim=observation_dim,
        action_dim=action_dim,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        delta_normalizer=delta_normalizer,
        **variant['model_kwargs']
    )
    mpc_controller = MPCController(
        env,
        model,
        env.cost_fn,
        **variant['mpc_controller_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=mpc_controller,
    )
    algo = DistanceModelTrainer(
        env,
        model,
        mpc_controller,
        exploration_policy=exploration_policy,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        delta_normalizer=delta_normalizer,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algo.to(ptu.device)
    algo.train()

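# Hypothetical variant skeleton for the model-based launcher above. The keys
# are exactly those experiment() reads; all values are placeholders, and
# env_class must supply a cost_fn for the MPC controller.
example_variant = dict(
    env_class=None,  # a multitask env class exposing cost_fn (placeholder)
    env_kwargs=dict(),
    multitask=True,
    normalize_kwargs=dict(),
    model_kwargs=dict(),           # DynamicsModel hyperparameters
    mpc_controller_kwargs=dict(),  # e.g. planning horizon, sampled action sequences
    ou_kwargs=dict(),
    algo_kwargs=dict(),
)
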
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class'](**variant['env_kwargs']))
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

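# Hypothetical driver for the SAC launcher above, reusing a multitask env
# class that appears elsewhere in these examples. 'net_size' and the empty
# 'algo_params' are illustrative and assume SoftActorCritic has sensible
# defaults; they are not tuned settings from this repo.
if __name__ == "__main__":
    variant = dict(
        env_class=MultitaskPoint2DEnv,
        env_kwargs=dict(),
        multitask=True,
        net_size=300,
        algo_params=dict(),  # SoftActorCritic kwargs
    )
    experiment(variant)
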
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_kwargs'])
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = TwinSAC(
        env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    obs_dim = int(env.observation_space.flat_dim)
    action_dim = int(env.action_space.flat_dim)
    obs_normalizer = TorchFixedNormalizer(obs_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    qf = MlpQf(
        input_size=obs_dim + action_dim,
        output_size=1,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        obs_normalizer=obs_normalizer,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        **variant['algo_kwargs']
    )
    algorithm.train()

def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    # Note: the Q-function and value function share the same 'vf_params'
    # hyperparameters; the variant carries no separate qf_params entry.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['vf_params']
    )
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_params'])
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(
        env,
        qf=qf,
        vf=vf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v2'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    # env = normalize(GymEnv(
    #     'HalfCheetah-v1',
    #     force_reset=True,
    #     record_video=False,
    #     record_log=False,
    # ))
    env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,  # hidden layer sizes (fc1, fc2)
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs'])  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()

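# For reference, a hypothetical variant for the TD3 launcher above, showing
# the three exploration_type values it accepts (values below illustrative):
#   'ou'       -> OUStrategy (temporally correlated noise)
#   'gaussian' -> GaussianStrategy with constant sigma = 0.1
#   'epsilon'  -> EpsilonGreedy with 10% random actions
example_variant = dict(
    multitask=True,
    env_kwargs=dict(),
    normalize=True,
    exploration_type='ou',
    algo_kwargs=dict(),
    use_gpu=False,
    gpu_id=0,
)
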
def experiment(variant):
    # if variant['multitask']:
    #     env = MultitaskPoint2DEnv(**variant['env_kwargs'])
    #     env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    env_name = variant["env_name"]
    env = gym.make(env_name)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    env = gym.make(variant['env_id'])
    env = NormalizedBoxEnv(
        ImageEnv(
            env,
            imsize=imsize,
            keep_prev=history - 1,
            init_viewer=variant['init_viewer'],
        )
    )
    es = GaussianStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    qf2 = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        policy_and_target_update_period=15,
        policy_learning_rate=1e-5,
        **variant['algo_kwargs']
    )
    # algorithm = DDPG(
    #     env,
    #     qf=qf1,
    #     policy=policy,
    #     # qf_weight_decay=.01,
    #     exploration_policy=exploration_policy,
    #     **variant['algo_kwargs']
    # )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env_class = variant['env_class']
    env = env_class(**variant['env_params'])
    env = NormalizedBoxEnv(env, **variant['normalize_params'])
    observation_space = convert_gym_space(env.observation_space)
    action_space = convert_gym_space(env.action_space)
    qf = variant['qf_class'](
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['qf_params']
    )
    policy = FFUniversalPolicy(
        int(observation_space.flat_dim),
        int(action_space.flat_dim),
        env.goal_dim,
        **variant['policy_params']
    )
    epoch_discount_schedule = None
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    if epoch_discount_schedule_class is not None:
        epoch_discount_schedule = epoch_discount_schedule_class(
            **variant['epoch_discount_schedule_params']
        )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    es = variant['sampler_es_class'](
        action_space=action_space,
        **variant['sampler_es_params']
    )
    if variant['explore_with_ddpg_policy']:
        raw_exploration_policy = policy
    else:
        raw_exploration_policy = TerminalRewardSampleOCPolicy(
            qf,
            env,
            5,
        )
    exploration_policy = UniversalPolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=raw_exploration_policy,
    )
    algo = variant['algo_class'](
        env,
        qf,
        policy,
        exploration_policy,
        epoch_discount_schedule=epoch_discount_schedule,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algo.cuda()
    algo.train()

def experiment(variant):
    ptu.set_gpu_mode(True, 0)
    imsize = variant['imsize']
    env = ImageForkReacher2dEnv(
        variant["arm_goal_distance_cost_coeff"],
        variant["arm_object_distance_cost_coeff"],
        [imsize, imsize, 3],
        goal_object_distance_cost_coeff=variant["goal_object_distance_cost_coeff"],
        ctrl_cost_coeff=variant["ctrl_cost_coeff"],
    )
    partial_obs_size = env.obs_dim - imsize * imsize * 3
    print("partial dim was " + str(partial_obs_size))
    env = NormalizedBoxEnv(env)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    qf1 = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=3,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    qf2 = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=3,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    vf = CNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=3,
        **variant['cnn_params']
    )
    policy = TanhCNNGaussianPolicy(
        input_width=imsize,
        input_height=imsize,
        output_size=action_dim,
        input_channels=3,
        **variant['cnn_params']
    )
    algorithm = TwinSAC(
        env=env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = Point2DEnv(**variant['env_kwargs'])
    env = FlatGoalEnv(env)
    env = NormalizedBoxEnv(env)
    action_dim = int(np.prod(env.action_space.shape))
    obs_dim = int(np.prod(env.observation_space.shape))
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = TwinSACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        data_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

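# Unlike the older launchers above, this one uses the trainer / path-collector
# split: TwinSACTrainer only performs gradient updates, MdpPathCollector
# gathers rollouts, and TorchBatchRLAlgorithm orchestrates the loop. A
# hypothetical variant skeleton (keys taken from the code above, sizes and
# buffer capacity illustrative):
example_variant = dict(
    env_kwargs=dict(),
    qf_kwargs=dict(hidden_sizes=[256, 256]),      # illustrative sizes
    policy_kwargs=dict(hidden_sizes=[256, 256]),  # illustrative sizes
    replay_buffer_size=int(1e6),
    trainer_kwargs=dict(),
    algo_kwargs=dict(),
)
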
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    # Collect the flat hyperparameters from the variant into the algorithm kwargs.
    variant['algo_kwargs'] = dict(
        num_epochs=variant['num_epochs'],
        num_steps_per_epoch=variant['num_steps_per_epoch'],
        num_steps_per_eval=variant['num_steps_per_eval'],
        max_path_length=variant['max_path_length'],
        min_num_steps_before_training=variant['min_num_steps_before_training'],
        batch_size=variant['batch_size'],
        discount=variant['discount'],
        replay_buffer_size=variant['replay_buffer_size'],
        soft_target_tau=variant['soft_target_tau'],
        target_update_period=variant['target_update_period'],
        train_policy_with_reparameterization=variant[
            'train_policy_with_reparameterization'],
        policy_lr=variant['policy_lr'],
        qf_lr=variant['qf_lr'],
        vf_lr=variant['vf_lr'],
        reward_scale=variant['reward_scale'],
        use_automatic_entropy_tuning=variant.get(
            'use_automatic_entropy_tuning', False),
    )
    M = variant['layer_size']
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
        # **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        # **variant['policy_kwargs']
    )
    algorithm = SoftActorCritic(
        env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf.cuda()
        vf.cuda()
        policy.cuda()
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def her_twin_sac_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    observation_key = variant.get('observation_key', 'observation')
    desired_goal_key = variant.get('desired_goal_key', 'desired_goal')
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = env.observation_space.spaces['observation'].low.size
    action_dim = env.action_space.low.size
    goal_dim = env.observation_space.spaces['desired_goal'].low.size
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    vf = FlattenMlp(
        input_size=obs_dim + goal_dim,
        output_size=1,
        **variant['vf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    algorithm = HerTwinSac(
        env,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        policy=policy,
        replay_buffer=replay_buffer,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        qf1.to(ptu.device)
        qf2.to(ptu.device)
        vf.to(ptu.device)
        policy.to(ptu.device)
        algorithm.to(ptu.device)
    algorithm.train()

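# Hypothetical variant sketch for the HER + Twin SAC launcher above. The env
# must expose a dict observation space with 'observation' and 'desired_goal'
# entries (that is what the obs_dim/goal_dim lookups above assume), and
# ObsDictRelabelingBuffer relabels stored goals for hindsight replay. All
# values below are placeholders.
example_variant = dict(
    env_class=None,  # a goal-conditioned, dict-observation env class (placeholder)
    env_kwargs=dict(),
    normalize=False,
    replay_buffer_kwargs=dict(),  # e.g. capacity, fraction of relabeled goals
    qf_kwargs=dict(),
    vf_kwargs=dict(),
    policy_kwargs=dict(),
    algo_kwargs=dict(),
)
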
def experiment(variant):
    # env = NormalizedBoxEnv(MultiGoalEnv(
    #     actuation_cost_coeff=10,
    #     distance_cost_coeff=1,
    #     goal_reward=10,
    # ))
    env = NormalizedBoxEnv(HalfCheetahEnv())
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    # qf = ExpectableQF(
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     hidden_size=100,
    # )
    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    # TODO(vitchyr): just creating the plotter crashes EC2
    # plotter = QFPolicyPlotter(
    #     qf=qf,
    #     policy=policy,
    #     obs_lst=np.array([[-2.5, 0.0],
    #                       [0.0, 0.0],
    #                       [2.5, 2.5]]),
    #     default_action=[np.nan, np.nan],
    #     n_samples=100,
    # )
    algorithm = ExpectedSAC(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        # plotter=plotter,
        # render_eval_paths=True,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    # env = InvertedDoublePendulumEnv()  # gym.make(variant['env_id'])
    # env = SawyerXYZEnv()
    env = RandomGoalPusher2DEnv()
    partial_obs_size = env.obs_dim
    env = NormalizedBoxEnv(
        ImageMujocoWithObsEnv(
            env,
            imsize=imsize,
            keep_prev=history - 1,
            init_camera=variant['init_camera'],
        )
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    # )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim + partial_obs_size,
        **variant['cnn_params']
    )
    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=partial_obs_size,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        # qf_weight_decay=.01,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if 'history_len' in variant:
        history_len = variant['history_len']
        env = MultiTaskHistoryEnv(env, history_len=history_len)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs']
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs']
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    es = OUStrategy(
        action_space=env.action_space,
        max_sigma=0.1,
        min_sigma=0.1,  # Constant sigma
    )
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    env.set_goal(variant['goal'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    env = gym.make(variant['env_id']).env
    training_env = gym.make(variant['env_id']).env
    env = NormalizedBoxEnv(env)
    training_env = NormalizedBoxEnv(training_env)
    env = ImageMujocoEnv(
        env,
        imsize=imsize,
        keep_prev=history - 1,
        init_camera=variant['init_camera'],
    )
    training_env = ImageMujocoEnv(
        training_env,
        imsize=imsize,
        keep_prev=history - 1,
        init_camera=variant['init_camera'],
    )
    env = DiscretizeEnv(env, variant['bins'])
    training_env = DiscretizeEnv(training_env, variant['bins'])
    qf = CNN(
        output_size=env.action_space.n,
        input_width=imsize,
        input_height=imsize,
        input_channels=history,
        **variant['cnn_params']
    )
    qf_criterion = variant['qf_criterion_class']()
    algorithm = variant['algo_class'](
        env,
        training_env=training_env,
        qf=qf,
        qf_criterion=qf_criterion,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()

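# Hypothetical variant sketch for the discretized image-based launcher above.
# DiscretizeEnv presumably bins the continuous action space into 'bins'
# discrete choices, and the CNN outputs one Q-value per discrete action
# (output_size=env.action_space.n). All values below are illustrative.
example_variant = dict(
    env_id='InvertedPendulum-v2',  # illustrative MuJoCo gym id
    imsize=84,
    history=1,       # number of stacked frames (keep_prev = history - 1)
    init_camera=None,
    bins=5,          # discretization bins per action dimension
    cnn_params=dict(),
    qf_criterion_class=None,  # e.g. a torch loss class (placeholder)
    algo_class=None,          # e.g. a DQN-style algorithm class (placeholder)
    algo_params=dict(),
)
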
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    # env = gym.make(variant['env_id']).env
    env = Pusher2DEnv()
    env = NormalizedBoxEnv(
        ImageMujocoEnv(
            env,
            imsize=imsize,
            keep_prev=history - 1,
            init_camera=variant['init_camera'],
        )
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    # )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim,
        **variant['cnn_params']
    )
    vf = CNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        **variant['cnn_params']
    )
    policy = TanhCNNGaussianPolicy(
        input_width=imsize,
        input_height=imsize,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params']
    )
    algorithm = SoftActorCritic(
        env=env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()