def run_experiment(self):
    all_imgs = []
    policy = OUStrategy(self.env.action_space)
    for i in range(self.num_episodes):
        state = self.env.reset()
        # 6912 = 3 * 48 * 48, the flattened image observation
        img = ptu.from_numpy(state['image_observation']).view(1, 6912)
        latent_state = self.vae.encode(img)[0]
        true_curr = state['image_observation'] * 255.0
        all_imgs.append(ptu.from_numpy(true_curr).view(3, 48, 48))
        actions = []
        for j in range(self.episode_len):
            u = policy.get_action_from_raw_action(self.env.action_space.sample())
            actions.append(u)
            state = self.env.step(u)[0]
            true_curr = state['image_observation'] * 255.0
            all_imgs.append(ptu.from_numpy(true_curr).view(3, 48, 48))
        # Replay the same actions through the learned latent dynamics model.
        pred_curr = self.vae.decode(latent_state)[0] * 255.0
        all_imgs.append(pred_curr.view(3, 48, 48))
        for j in range(self.episode_len):
            u = ptu.from_numpy(actions[j]).view(1, 2)
            latent_state = self.vae.process_dynamics(latent_state, u)
            pred_curr = self.vae.decode(latent_state)[0] * 255.0
            all_imgs.append(pred_curr.view(3, 48, 48))
    all_imgs = torch.stack(all_imgs)
    save_image(
        all_imgs.data,
        "/home/khazatsky/rail/data/rail-khazatsky/sasha/dynamics_visualizer/dynamics.png",
        nrow=self.episode_len + 1,
    )
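OUStrategy, which appears in every launcher below, perturbs a policy's raw actions with temporally correlated Ornstein-Uhlenbeck noise. A minimal self-contained sketch of the standard discrete-time update (this is the textbook form, not railrl's exact implementation; `theta` and `sigma` mirror the constructor kwargs used later in this section, and the defaults are the DDPG-paper values):

import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: noise that reverts toward the mean mu at
    rate theta, perturbed by Gaussian increments of scale sigma."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def reset(self):
        # Restart the process at the mean, e.g. at episode boundaries.
        self.state = self.mu.copy()

    def sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I)
        self.state += self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(*self.state.shape)
        return self.state.copy()

# Usage: perturb a raw action, as get_action_from_raw_action does above.
noise = OUNoiseSketch(action_dim=2)
raw_action = np.zeros(2)  # stand-in for a policy output
noisy_action = np.clip(raw_action + noise.sample(), -1.0, 1.0)

Because consecutive samples are correlated, the perturbation drifts smoothly instead of jittering independently at every step, which is why it is a common choice for exploration in continuous control.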
def generate_vae_dataset(
        N=10000,
        test_p=0.9,
        use_cached=True,
        imsize=84,
        show=False,
        dataset_path=None,
):
    filename = "/tmp/sawyer_push_new_easy_wider2_" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        env = SawyerPushXYEasyEnv(hide_goal=True)
        env = ImageMujocoEnv(
            env,
            imsize,
            transpose=True,
            init_camera=sawyer_init_camera_zoomed_in,
            # init_camera=sawyer_init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = OUStrategy(env.action_space)
        dataset = np.zeros((N, imsize * imsize * 3))
        for i in range(N):
            # env.reset()
            if i % 100 == 0:
                g = env.sample_goal_for_rollout()
                env.set_goal(g)
                policy.reset()
            u = policy.get_action_from_raw_action(env.action_space.sample())
            img = env.step(u)[0]
            dataset[i, :] = img
            if show:
                # env.render()
                cv2.imshow('img', img.reshape(3, imsize, imsize).transpose())
                cv2.waitKey(1)
        print("done making training data", filename, time.time() - now)
        np.save(filename, dataset)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info
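For reference, a hedged usage sketch of the function above (values are illustrative; the cache filename is derived from N exactly as in the function body):

# Generate (or load from the /tmp cache) a 1000-sample pusher image dataset
# and split it 90/10 into train/test.
train_data, test_data, info = generate_vae_dataset(
    N=1000,
    test_p=0.9,
    use_cached=False,  # force regeneration instead of loading the cache
    imsize=84,
)
print(train_data.shape, test_data.shape)  # (900, 21168), (100, 21168); 21168 = 84*84*3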
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf_hidden_sizes = variant['qf_hidden_sizes']
    qf = EasyVQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = EasyVQLearning(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        **variant['algo_params']
    )
    algorithm.train()
    return algorithm.final_score
def get_exploration_strategy(variant, env):
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy

    exploration_type = variant.get('exploration_type', 'epsilon')
    exploration_noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # constant sigma
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=exploration_noise,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    return es
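A usage sketch for the factory above. The two variant keys shown are the only ones it reads (with defaults 'epsilon' and 0.1); any environment with a Box action space works, and Pendulum-v0 is just a convenient stand-in:

import gym

env = gym.make('Pendulum-v0')  # any Box-action-space env
variant = {'exploration_type': 'ou', 'exploration_noise': 0.2}
es = get_exploration_strategy(variant, env)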
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskEnvToSilentMultitaskEnv(env)
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    observation_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    obs_normalizer = TorchFixedNormalizer(observation_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    delta_normalizer = TorchFixedNormalizer(observation_dim)
    model = DynamicsModel(
        observation_dim=observation_dim,
        action_dim=action_dim,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        delta_normalizer=delta_normalizer,
        **variant['model_kwargs']
    )
    mpc_controller = MPCController(
        env,
        model,
        env.cost_fn,
        **variant['mpc_controller_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=mpc_controller,
    )
    algo = DistanceModelTrainer(
        env,
        model,
        mpc_controller,
        exploration_policy=exploration_policy,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        delta_normalizer=delta_normalizer,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algo.to(ptu.device)
    algo.train()
def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    # Note: the Q-function deliberately reuses the value function's params.
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['vf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        **variant['vf_params']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_params']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(
        env,
        qf=qf,
        vf=vf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params']
    )
    algorithm = DDPG(
        env,
        exploration_strategy=es,
        qf=qf,
        policy=policy,
        epoch_discount_schedule=epoch_discount_schedule,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    hidden_size = variant['hidden_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_size, hidden_size],
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    obs_dim = int(env.observation_space.flat_dim)
    action_dim = int(env.action_space.flat_dim)
    obs_normalizer = TorchFixedNormalizer(obs_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    qf = MlpQf(
        input_size=obs_dim + action_dim,
        output_size=1,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        obs_normalizer=obs_normalizer,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        obs_normalizer=obs_normalizer,
        action_normalizer=action_normalizer,
        **variant['algo_kwargs']
    )
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    # env = NormalizedBoxEnv(env)
    # tdm_normalizer = TdmNormalizer(
    #     env,
    #     vectorized=True,
    #     max_tau=variant['algo_kwargs']['tdm_kwargs']['max_tau'],
    # )
    tdm_normalizer = None
    qf = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = variant['algo_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
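The launcher above expects a nested variant dict. Below is a hypothetical skeleton inferred purely from the keys the launcher reads; every value is illustrative, the env class is a placeholder, and the replay-buffer kwarg name is an assumption, not a railrl default:

from torch import nn

variant = dict(
    env_class=SomeMultitaskEnv,  # hypothetical; any multitask env class
    env_kwargs=dict(),
    qf_kwargs=dict(hidden_sizes=[300, 300]),      # illustrative
    policy_kwargs=dict(hidden_sizes=[300, 300]),  # illustrative
    es_kwargs=dict(theta=0.1, max_sigma=0.1, min_sigma=0.1),
    her_replay_buffer_kwargs=dict(max_size=int(1e6)),  # kwarg name assumed
    qf_criterion_class=nn.MSELoss,
    algo_kwargs=dict(
        ddpg_kwargs=dict(),           # qf_criterion is injected by the launcher
        tdm_kwargs=dict(max_tau=10),  # tdm_normalizer is injected by the launcher
    ),
)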
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    gcm = FlattenMlp(
        input_size=env.goal_dim + obs_dim + action_dim + 1,
        output_size=env.goal_dim,
        **variant['gcm_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_kwargs']
    )
    gcm_criterion = variant['gcm_criterion_class'](
        **variant['gcm_criterion_kwargs']
    )
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['base_kwargs']['replay_buffer'] = replay_buffer
    algorithm = GcmDdpg(
        env,
        gcm=gcm,
        policy=policy,
        exploration_policy=exploration_policy,
        gcm_criterion=gcm_criterion,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs']
        )  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class']()
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    # qf = StructuredQF(
    #     observation_dim=obs_dim,
    #     action_dim=action_dim,
    #     goal_dim=env.goal_dim,
    #     output_size=env.goal_dim if vectorized else 1,
    #     **variant['qf_params']
    # )
    qf = OneHotTauQF(
        observation_dim=obs_dim,
        action_dim=action_dim,
        goal_dim=env.goal_dim,
        output_size=env.goal_dim if vectorized else 1,
        **variant['qf_params']
    )
    vf = FlattenMlp(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=env.goal_dim if vectorized else 1,
        **variant['vf_params']
    )
    policy = MlpPolicy(
        input_size=obs_dim + env.goal_dim + 1,
        output_size=action_dim,
        **variant['policy_params']
    )
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_params']
    )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    algo_params = variant['algo_params']
    algo_params['n3dpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmPlotter(
        tdm=qf,
        # location_lst=np.array([-10, 0, 10]),
        # goal_lst=np.array([-10, 0, 5]),
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['n3dpg_kwargs']['plotter'] = plotter
    algorithm = TdmN3dpg(
        env,
        qf=qf,
        vf=vf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_params
    )
    algorithm.to(ptu.device)
    algorithm.train()
def generate_vae_dataset(
        N=10000,
        test_p=0.9,
        use_cached=True,
        imsize=84,
        show=False,
        dataset_path=None,
        policy_path=None,
        action_space_sampling=False,
        env_class=SawyerDoorEnv,
        env_kwargs=None,
        init_camera=sawyer_door_env_camera_v2,
):
    if env_kwargs is None:
        # Guard against the None default: env_class(**None) would crash.
        env_kwargs = {}
    if policy_path is not None:
        filename = "/tmp/sawyer_door_pull_open_oracle+random_policy_data_closer_zoom_action_limited" + str(N) + ".npy"
    elif action_space_sampling:
        filename = "/tmp/sawyer_door_pull_open_zoomed_in_action_space_sampling" + str(N) + ".npy"
    else:
        filename = "/tmp/sawyer_door_pull_open" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        env = env_class(**env_kwargs)
        env = ImageEnv(
            env,
            imsize,
            transpose=True,
            init_camera=init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = RandomPolicy(env.action_space)
        es = OUStrategy(action_space=env.action_space, theta=0)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )
        env.wrapped_env.reset()
        dataset = np.zeros((N, imsize * imsize * 3), dtype=np.uint8)
        for i in range(N):
            if i % 20 == 0:
                env.reset_model()
                exploration_policy.reset()
            for _ in range(10):
                action = exploration_policy.get_action()[0]
                env.wrapped_env.step(action)
            # env.set_to_goal_angle(env.get_goal()['state_desired_goal'])
            img = env._get_flat_img()
            dataset[i, :] = unormalize_image(img)
            if show:
                cv2.imshow('img', img.reshape(3, imsize, imsize).transpose())
                cv2.waitKey(1)
            print(i)
        print("done making training data", filename, time.time() - now)
        np.save(filename, dataset)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info
def experiment(variant):
    # if variant['multitask']:
    #     env = MultitaskPoint2DEnv(**variant['env_kwargs'])
    #     env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    env_name = variant["env_name"]
    env = gym.make(env_name)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()
def generate_goal_data_set(env=None, num_goals=1000, use_cached_dataset=False,
                           action_scale=1 / 10):
    if use_cached_dataset and osp.isfile(
            '/tmp/goals' + str(num_goals) + '.npy'):
        goal_dict = np.load('/tmp/goals' + str(num_goals) + '.npy').item()
        print("loaded data from saved file")
        return goal_dict
    cached_goal_keys = [
        'latent_desired_goal',
        'image_desired_goal',
        'state_desired_goal',
        'joint_desired_goal',
    ]
    goal_sizes = [
        env.observation_space.spaces['latent_desired_goal'].low.size,
        env.observation_space.spaces['image_desired_goal'].low.size,
        env.observation_space.spaces['state_desired_goal'].low.size,
        7,
    ]
    observation_keys = [
        'latent_observation',
        'image_observation',
        'state_observation',
        'state_observation',
    ]
    goal_generation_dict = dict()
    for goal_key, goal_size, obs_key in zip(
            cached_goal_keys,
            goal_sizes,
            observation_keys,
    ):
        goal_generation_dict[goal_key] = [goal_size, obs_key]
    goal_dict = dict()
    policy = RandomPolicy(env.action_space)
    es = OUStrategy(action_space=env.action_space, theta=0)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    for goal_key in goal_generation_dict:
        goal_size, obs_key = goal_generation_dict[goal_key]
        goal_dict[goal_key] = np.zeros((num_goals, goal_size))
    print('Generating Random Goals')
    for i in range(num_goals):
        if i % 50 == 0:
            print('Reset')
            env.reset_model()
            exploration_policy.reset()
        action = exploration_policy.get_action()[0] * action_scale
        obs, _, _, _ = env.step(action)
        print(i)
        for goal_key in goal_generation_dict:
            goal_size, obs_key = goal_generation_dict[goal_key]
            goal_dict[goal_key][i, :] = obs[obs_key]
    np.save('/tmp/goals' + str(num_goals) + '.npy', goal_dict)
    return goal_dict
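For orientation, the structure of the dict returned by the function above (derived directly from its loop; the per-key sizes depend on the env's observation spaces):

# For num_goals=100, generate_goal_data_set returns:
# {
#     'latent_desired_goal': np.ndarray of shape (100, latent goal size),
#     'image_desired_goal':  np.ndarray of shape (100, image goal size),
#     'state_desired_goal':  np.ndarray of shape (100, state goal size),
#     'joint_desired_goal':  np.ndarray of shape (100, 7),
# }
# The same dict is cached to '/tmp/goals<num_goals>.npy'.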
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        with tf.Session():
            data = joblib.load(load_policy_file)
            print(data)
            policy = data['policy']
            qf = data['qf']
            replay_buffer = data['pool']
            env = HalfCheetahEnv()
            es = OUStrategy(action_space=env.action_space)
            use_new_version = variant['use_new_version']
            algorithm = DDPG(
                env,
                es,
                policy,
                qf,
                n_epochs=2,
                batch_size=1024,
                replay_pool=replay_buffer,
                use_new_version=use_new_version,
            )
            algorithm.train()
    else:
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            use_new_version=use_new_version,
        )
        algorithm.train()
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    memory_dim = variant['memory_dim']
    ou_params = variant['ou_params']

    set_seed(seed)

    """
    Code for running the experiment.
    """
    env = env_class(**env_params)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=memory_dim,
    )
    env = FlattenedProductBox(env)
    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(
        env_spec=env.spec,
        **ou_params
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )
    algorithm.train()
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']
        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']
    # env = InvertedDoublePendulumEnv()  # gym.make(variant['env_id'])
    # env = SawyerXYZEnv()
    env = RandomGoalPusher2DEnv()
    partial_obs_size = env.obs_dim
    env = NormalizedBoxEnv(
        ImageMujocoWithObsEnv(
            env,
            imsize=imsize,
            keep_prev=history - 1,
            init_camera=variant['init_camera'],
        )
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    # )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = MergedCNN(
        input_width=imsize,
        input_height=imsize,
        output_size=1,
        input_channels=history,
        added_fc_input_size=action_dim + partial_obs_size,
        **variant['cnn_params']
    )
    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=partial_obs_size,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        # qf_weight_decay=.01,
        exploration_policy=exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if 'history_len' in variant:
        history_len = variant['history_len']
        env = MultiTaskHistoryEnv(env, history_len=history_len)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    algorithm = HerTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env = variant['env_class']()
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    if vectorized:
        qf = VectorizedDiscreteQFunction(
            observation_dim=int(np.prod(env.observation_space.low.shape)),
            action_dim=env.action_space.n,
            goal_dim=env.goal_dim,
            **variant['qf_params']
        )
        policy = ArgmaxDiscreteTdmPolicy(qf, **variant['policy_params'])
    else:
        qf = FlattenMlp(
            input_size=(
                int(np.prod(env.observation_space.shape)) + env.goal_dim + 1
            ),
            output_size=env.action_space.n,
            **variant['qf_params']
        )
        policy = ArgmaxDiscretePolicy(qf)
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(
        env=env,
        **variant['her_replay_buffer_params']
    )
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params']
    )
    algo_params = variant['algo_params']
    algo_params['ddpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmDiscretePlotter(
        tdm=qf,
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['ddpg_kwargs']['plotter'] = plotter
    algorithm = TdmDdpg(
        env,
        qf=qf,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_params
    )
    algorithm.to(ptu.device)
    algorithm.train()
def tdm_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    tdm_normalizer = None
    qf1 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    qf2 = TdmQf(
        env=env,
        vectorized=True,
        tdm_normalizer=tdm_normalizer,
        **variant['qf_kwargs']
    )
    policy = TdmPolicy(
        env=env,
        tdm_normalizer=tdm_normalizer,
        **variant['policy_kwargs']
    )
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env,
        **variant['replay_buffer_kwargs']
    )
    qf_criterion = variant['qf_criterion_class']()
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = qf_criterion
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmTd3(
        env,
        qf1=qf1,
        qf2=qf2,
        replay_buffer=replay_buffer,
        policy=policy,
        exploration_policy=exploration_policy,
        **algo_kwargs
    )
    algorithm.to(ptu.device)
    algorithm.train()
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly,
    )
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    algo_params = variant['algo_params']
    # memory_dim was referenced below but never defined; assumed to come
    # from the variant, as in the companion launcher above.
    memory_dim = variant['memory_dim']

    set_seed(seed)
    onehot_dim = num_values + 1
    env_action_dim = num_values + 1

    """
    Code for running the experiment.
    """
    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    # env = FlattenedProductBox(env)
    # qf = FeedForwardCritic(
    #     name_or_scope="critic",
    #     env_spec=env.spec,
    # )
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = ActionAwareMemoryPolicy(
        name_or_scope="noisy_policy",
        action_dim=env_action_dim,
        memory_dim=memory_dim,
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec)
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )
    algorithm.train()
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = NafPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
    )
    algorithm = NAF(
        env,
        naf_policy=qf,
        exploration_strategy=es,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[100, 100],
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    #     **variant['es_kwargs']
    # )
    # es = EpsilonGreedy(
    #     action_space=env.action_space,
    #     prob_random_action=0.2,
    # )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()