Example 1
    def run_experiment(self):
        all_imgs = []
        policy = OUStrategy(self.env.action_space)
        for i in range(self.num_episodes):
            state = self.env.reset()
            img = ptu.from_numpy(state['image_observation']).view(1, 6912)
            latent_state = self.vae.encode(img)[0]

            true_curr = state['image_observation'] * 255.0
            all_imgs.append(ptu.from_numpy(true_curr).view(3, 48, 48))

            # Roll out OU-perturbed random actions in the real environment,
            # recording the true frames and the actions taken.
            actions = []
            for j in range(self.episode_len):
                u = policy.get_action_from_raw_action(self.env.action_space.sample())
                actions.append(u)
                state = self.env.step(u)[0]
                true_curr = state['image_observation'] * 255.0
                all_imgs.append(ptu.from_numpy(true_curr).view(3, 48, 48))

            pred_curr = self.vae.decode(latent_state)[0] * 255.0
            all_imgs.append(pred_curr.view(3, 48, 48))

            # Replay the same actions through the learned latent dynamics
            # model and decode the predicted frames.
            for j in range(self.episode_len):
                u = ptu.from_numpy(actions[j]).view(1, 2)
                latent_state = self.vae.process_dynamics(latent_state, u)
                pred_curr = self.vae.decode(latent_state)[0] * 255.0
                all_imgs.append(pred_curr.view(3, 48, 48))

        # Each episode contributes one row of real frames followed by one row
        # of predicted frames (episode_len + 1 images per row).
        all_imgs = torch.stack(all_imgs)
        save_image(
            all_imgs.data,
            "/home/khazatsky/rail/data/rail-khazatsky/sasha/dynamics_visualizer/dynamics.png",
            nrow=self.episode_len + 1,
        )
Example 2
def generate_vae_dataset(
    N=10000,
    test_p=0.9,
    use_cached=True,
    imsize=84,
    show=False,
    dataset_path=None,
):
    filename = "/tmp/sawyer_push_new_easy_wider2_" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        env = SawyerPushXYEasyEnv(hide_goal=True)
        env = ImageMujocoEnv(
            env,
            imsize,
            transpose=True,
            init_camera=sawyer_init_camera_zoomed_in,
            # init_camera=sawyer_init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = OUStrategy(env.action_space)

        dataset = np.zeros((N, imsize * imsize * 3))
        for i in range(N):
            # env.reset()
            if i % 100 == 0:
                g = env.sample_goal_for_rollout()
                env.set_goal(g)
                policy.reset()
            u = policy.get_action_from_raw_action(env.action_space.sample())
            img = env.step(u)[0]
            dataset[i, :] = img
            if show:
                # env.render()
                cv2.imshow('img', img.reshape(3, 84, 84).transpose())
                cv2.waitKey(1)
        print("done making training data", filename, time.time() - now)
        np.save(filename, dataset)

    n = int(N * test_p)  # NOTE: test_p is effectively the *train* fraction here
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info
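A minimal call sketch for the function above, assuming the module-level imports it relies on (numpy, the Sawyer env wrappers, etc.) are available; the argument values are illustrative only:

    # Hypothetical usage: build a small dataset and check the train/test split.
    train_data, test_data, info = generate_vae_dataset(N=1000, use_cached=False)
    print(train_data.shape)  # (900, 21168) -- test_p=0.9 acts as the train fraction
    print(test_data.shape)   # (100, 21168), since imsize=84 gives 84 * 84 * 3 = 21168 values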
Example 3
def experiment(variant):
    # env = HalfCheetahEnv()
    # env = PointEnv()
    env = gym_env("Pendulum-v0")
    # env = HopperEnv()
    horizon = variant['algo_params']['max_path_length']
    env = TimeLimitedEnv(env, horizon)
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf_hidden_sizes = variant['qf_hidden_sizes']
    qf = EasyVQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
        qf_hidden_sizes,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    algorithm = EasyVQLearning(env,
                               exploration_strategy=es,
                               qf=qf,
                               policy=policy,
                               **variant['algo_params'])
    algorithm.train()
    return algorithm.final_score
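The experiment above only reads 'qf_hidden_sizes' and 'algo_params' (which must contain 'max_path_length') from the variant. A hypothetical configuration, with placeholder values rather than tuned ones:

    # Sketch of a variant accepted by experiment(); numbers are illustrative.
    variant = dict(
        qf_hidden_sizes=100,
        algo_params=dict(
            max_path_length=200,
            # any further EasyVQLearning keyword arguments go here
        ),
    )
    final_score = experiment(variant)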
Example 4
def get_exploration_strategy(variant, env):
    from railrl.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from railrl.exploration_strategies.gaussian_strategy import GaussianStrategy
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    exploration_type = variant.get('exploration_type', 'epsilon')
    exploration_noise = variant.get('exploration_noise', 0.1)
    if exploration_type == 'ou':
        es = OUStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=exploration_noise,
            min_sigma=exploration_noise,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=exploration_noise,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    return es
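A usage sketch for get_exploration_strategy, assuming a gym-style environment with a Box action space; the noise level is an arbitrary placeholder:

    # Hypothetical usage: constant-sigma OU noise around randomly sampled actions.
    import gym

    env = gym.make('Pendulum-v0')
    variant = {'exploration_type': 'ou', 'exploration_noise': 0.2}
    es = get_exploration_strategy(variant, env)
    noisy_action = es.get_action_from_raw_action(env.action_space.sample())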
Example 5
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['multitask']:
        env = MultitaskEnvToSilentMultitaskEnv(env)
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])

    observation_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    obs_normalizer = TorchFixedNormalizer(observation_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    delta_normalizer = TorchFixedNormalizer(observation_dim)
    model = DynamicsModel(observation_dim=observation_dim,
                          action_dim=action_dim,
                          obs_normalizer=obs_normalizer,
                          action_normalizer=action_normalizer,
                          delta_normalizer=delta_normalizer,
                          **variant['model_kwargs'])
    mpc_controller = MPCController(env, model, env.cost_fn,
                                   **variant['mpc_controller_kwargs'])
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=mpc_controller,
    )
    algo = DistanceModelTrainer(env,
                                model,
                                mpc_controller,
                                exploration_policy=exploration_policy,
                                obs_normalizer=obs_normalizer,
                                action_normalizer=action_normalizer,
                                delta_normalizer=delta_normalizer,
                                **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        algo.to(ptu.device)
    algo.train()
Example 6
def example(variant):
    env = variant['env_class']()
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['vf_params'])
    vf = FlattenMlp(input_size=obs_dim, output_size=1, **variant['vf_params'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           **variant['policy_params'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = N3DPG(env,
                      qf=qf,
                      vf=vf,
                      policy=policy,
                      exploration_policy=exploration_policy,
                      **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 7
def experiment(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        400,
        300,
    )
    epoch_discount_schedule_class = variant['epoch_discount_schedule_class']
    epoch_discount_schedule = epoch_discount_schedule_class(
        **variant['epoch_discount_schedule_params'])
    algorithm = DDPG(env,
                     exploration_strategy=es,
                     qf=qf,
                     policy=policy,
                     epoch_discount_schedule=epoch_discount_schedule,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 8
def example(variant):
    env = HalfCheetahEnv()
    if variant['normalize']:
        env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    policy = FeedForwardPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        32,
        32,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 9
def example(variant):
    env = variant['env_class']()
    env = NormalizedBoxEnv(env)
    obs_dim = get_dim(env.observation_space)
    action_dim = get_dim(env.action_space)
    es = OUStrategy(action_space=env.action_space)
    qf = FeedForwardQFunction(
        obs_dim,
        action_dim,
        **variant['qf_params']
    )
    policy = FeedForwardPolicy(
        obs_dim,
        action_dim,
        400,
        300,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf,
        policy,
        exploration_policy,
        **variant['algo_params']
    )
    algorithm.to(ptu.device)
    algorithm.train()
Example 10
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    hidden_size = variant['hidden_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[hidden_size, hidden_size],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[hidden_size, hidden_size],
    )
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
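The TD3 experiment above pulls 'env_params', 'hidden_size', 'es_kwargs', and 'algo_kwargs' from the variant. A placeholder configuration might look as follows (values are illustrative, not tuned):

    # Sketch of a variant for the Sawyer reaching TD3 experiment above.
    variant = dict(
        env_params=dict(),  # forwarded to SawyerXYZReachingEnv
        hidden_size=400,
        es_kwargs=dict(max_sigma=0.3, min_sigma=0.3),
        algo_kwargs=dict(),  # forwarded to TD3, e.g. epoch/batch settings
    )
    experiment(variant)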
Example 11
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    env = NormalizedBoxEnv(env, **variant['normalize_kwargs'])
    if variant['multitask']:
        env = MultitaskToFlatEnv(env)
    es = OUStrategy(action_space=env.action_space, **variant['ou_kwargs'])
    obs_dim = int(env.observation_space.flat_dim)
    action_dim = int(env.action_space.flat_dim)
    obs_normalizer = TorchFixedNormalizer(obs_dim)
    action_normalizer = TorchFixedNormalizer(action_dim)
    qf = MlpQf(input_size=obs_dim + action_dim,
               output_size=1,
               obs_normalizer=obs_normalizer,
               action_normalizer=action_normalizer,
               **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim,
                           output_size=action_dim,
                           obs_normalizer=obs_normalizer,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf,
                     policy,
                     exploration_policy,
                     obs_normalizer=obs_normalizer,
                     action_normalizer=action_normalizer,
                     **variant['algo_kwargs'])
    algorithm.train()
Example 12
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYZReachingEnv(**env_params)
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(env,
                     qf=qf,
                     policy=policy,
                     exploration_policy=exploration_policy,
                     **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
Example 13
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    # env = NormalizedBoxEnv(env)
    # tdm_normalizer = TdmNormalizer(
    #     env,
    #     vectorized=True,
    #     max_tau=variant['algo_kwargs']['tdm_kwargs']['max_tau'],
    # )
    tdm_normalizer = None
    qf = TdmQf(env=env,
               vectorized=True,
               tdm_normalizer=tdm_normalizer,
               **variant['qf_kwargs'])
    policy = TdmPolicy(env=env,
                       tdm_normalizer=tdm_normalizer,
                       **variant['policy_kwargs'])
    es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env,
                                    **variant['her_replay_buffer_kwargs'])
    qf_criterion = variant['qf_criterion_class']()
    ddpg_tdm_kwargs = variant['algo_kwargs']
    ddpg_tdm_kwargs['ddpg_kwargs']['qf_criterion'] = qf_criterion
    ddpg_tdm_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmDdpg(env,
                        qf=qf,
                        replay_buffer=replay_buffer,
                        policy=policy,
                        exploration_policy=exploration_policy,
                        **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 14
def experiment(variant):
    env = NormalizedBoxEnv(variant['env_class']())

    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    gcm = FlattenMlp(input_size=env.goal_dim + obs_dim + action_dim + 1,
                     output_size=env.goal_dim,
                     **variant['gcm_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim + env.goal_dim + 1,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env,
                                    **variant['her_replay_buffer_kwargs'])
    gcm_criterion = variant['gcm_criterion_class'](
        **variant['gcm_criterion_kwargs'])
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['base_kwargs']['replay_buffer'] = replay_buffer
    algorithm = GcmDdpg(env,
                        gcm=gcm,
                        policy=policy,
                        exploration_policy=exploration_policy,
                        gcm_criterion=gcm_criterion,
                        **algo_kwargs)
    algorithm.to(ptu.device)
    algorithm.train()
Example 15
def experiment(variant):
    if variant['multitask']:
        env = MultitaskFullVAEPoint2DEnv(
            **variant['env_kwargs'])  # used point2d-conv-sweep/run1/id4
        env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    # NOTE: the non-multitask branch is disabled, so this script assumes multitask=True.
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(env,
                    qf1=qf1,
                    qf2=qf2,
                    policy=policy,
                    exploration_policy=exploration_policy,
                    **variant['algo_kwargs'])
    if variant["use_gpu"]:
        gpu_id = variant["gpu_id"]
        ptu.set_gpu_mode(True)
        ptu.set_device(gpu_id)
        algorithm.to(ptu.device)
        env._wrapped_env.vae.to(ptu.device)
    algorithm.train()
Example 16
def experiment(variant):
    env = variant['env_class']()

    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    # qf = StructuredQF(
    #     observation_dim=obs_dim,
    #     action_dim=action_dim,
    #     goal_dim=env.goal_dim,
    #     output_size=env.goal_dim if vectorized else 1,
    #     **variant['qf_params']
    # )
    qf = OneHotTauQF(observation_dim=obs_dim,
                     action_dim=action_dim,
                     goal_dim=env.goal_dim,
                     output_size=env.goal_dim if vectorized else 1,
                     **variant['qf_params'])
    vf = FlattenMlp(input_size=obs_dim + env.goal_dim + 1,
                    output_size=env.goal_dim if vectorized else 1,
                    **variant['vf_params'])
    policy = MlpPolicy(input_size=obs_dim + env.goal_dim + 1,
                       output_size=action_dim,
                       **variant['policy_params'])
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env,
                                    **variant['her_replay_buffer_params'])
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params'])
    algo_params = variant['algo_params']
    algo_params['n3dpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmPlotter(
        tdm=qf,
        # location_lst=np.array([-10, 0, 10]),
        # goal_lst=np.array([-10, 0, 5]),
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['n3dpg_kwargs']['plotter'] = plotter
    algorithm = TdmN3dpg(env,
                         qf=qf,
                         vf=vf,
                         replay_buffer=replay_buffer,
                         policy=policy,
                         exploration_policy=exploration_policy,
                         **algo_params)
    algorithm.to(ptu.device)
    algorithm.train()
Example 17
def generate_vae_dataset(
        N=10000, test_p=0.9, use_cached=True, imsize=84, show=False,
        dataset_path=None, policy_path=None, action_space_sampling=False,
        env_class=SawyerDoorEnv, env_kwargs=None,
        init_camera=sawyer_door_env_camera_v2,
):
    if policy_path is not None:
        filename = "/tmp/sawyer_door_pull_open_oracle+random_policy_data_closer_zoom_action_limited" + str(N) + ".npy"
    elif action_space_sampling:
        filename = "/tmp/sawyer_door_pull_open_zoomed_in_action_space_sampling" + str(N) + ".npy"
    else:
        filename = "/tmp/sawyer_door_pull_open" + str(N) + ".npy"
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        env = env_class(**(env_kwargs or {}))  # guard against the default env_kwargs=None
        env = ImageEnv(
            env, imsize,
            transpose=True,
            init_camera=init_camera,
            normalize=True,
        )
        info['env'] = env
        policy = RandomPolicy(env.action_space)
        es = OUStrategy(action_space=env.action_space, theta=0)
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )
        env.wrapped_env.reset()
        dataset = np.zeros((N, imsize * imsize * 3), dtype=np.uint8)
        for i in range(N):
            if i % 20 == 0:
                env.reset_model()
                exploration_policy.reset()
            for _ in range(10):
                action = exploration_policy.get_action()[0]
                env.wrapped_env.step(action)
            # env.set_to_goal_angle(env.get_goal()['state_desired_goal'])
            img = env._get_flat_img()
            dataset[i, :] = unormalize_image(img)
            if show:
                cv2.imshow('img', img.reshape(3, 84, 84).transpose())
                cv2.waitKey(1)
            print(i)
        print("done making training data", filename, time.time() - now)
        np.save(filename, dataset)

    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info
Example 18
def experiment(variant):
    # if variant['multitask']:
    #     env = MultitaskPoint2DEnv(**variant['env_kwargs'])
    #     env = MultitaskToFlatEnv(env)
    # else:
    #     env = Pusher2DEnv(**variant['env_kwargs'])
    env_name = variant["env_name"]
    env = gym.make(env_name)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()
Example 19
def generate_goal_data_set(env=None, num_goals=1000, use_cached_dataset=False,
                           action_scale=1 / 10):
    if use_cached_dataset and osp.isfile(
            '/tmp/goals' + str(num_goals) + '.npy'):
        goal_dict = np.load('/tmp/goals' + str(num_goals) + '.npy').item()
        print("loaded data from saved file")
        return goal_dict
    cached_goal_keys = [
        'latent_desired_goal',
        'image_desired_goal',
        'state_desired_goal',
        'joint_desired_goal',
    ]
    goal_sizes = [
        env.observation_space.spaces['latent_desired_goal'].low.size,
        env.observation_space.spaces['image_desired_goal'].low.size,
        env.observation_space.spaces['state_desired_goal'].low.size,
        7
    ]
    observation_keys = [
        'latent_observation',
        'image_observation',
        'state_observation',
        'state_observation',
    ]
    goal_generation_dict = dict()
    for goal_key, goal_size, obs_key in zip(
            cached_goal_keys,
            goal_sizes,
            observation_keys,
    ):
        goal_generation_dict[goal_key] = [goal_size, obs_key]
    goal_dict = dict()
    policy = RandomPolicy(env.action_space)
    es = OUStrategy(action_space=env.action_space, theta=0)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    for goal_key in goal_generation_dict:
        goal_size, obs_key = goal_generation_dict[goal_key]
        goal_dict[goal_key] = np.zeros((num_goals, goal_size))
    print('Generating Random Goals')
    for i in range(num_goals):
        if i % 50 == 0:
            print('Reset')
            env.reset_model()
            exploration_policy.reset()
        action = exploration_policy.get_action()[0] * action_scale
        obs, _, _, _ = env.step(action)
        print(i)
        for goal_key in goal_generation_dict:
            goal_size, obs_key = goal_generation_dict[goal_key]
            goal_dict[goal_key][i, :] = obs[obs_key]
    np.save('/tmp/goals' + str(num_goals) + '.npy', goal_dict)
    return goal_dict
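A hypothetical call to generate_goal_data_set, assuming env is a multitask image/latent environment that exposes the observation and goal keys referenced above:

    # Generate a small goal dataset and inspect one of the stored arrays.
    goal_dict = generate_goal_data_set(env=env, num_goals=200, use_cached_dataset=False)
    print(goal_dict['state_desired_goal'].shape)  # (200, state goal size)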
Example 20
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        with tf.Session():
            data = joblib.load(load_policy_file)
            print(data)
            policy = data['policy']
            qf = data['qf']
            replay_buffer = data['pool']
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            replay_pool=replay_buffer,
            use_new_version=use_new_version,
        )
        algorithm.train()
    else:
        env = HalfCheetahEnv()
        es = OUStrategy(action_space=env.action_space)
        qf = FeedForwardCritic(
            name_or_scope="critic",
            env_spec=env.spec,
        )
        policy = FeedForwardPolicy(
            name_or_scope="actor",
            env_spec=env.spec,
        )
        use_new_version = variant['use_new_version']
        algorithm = DDPG(
            env,
            es,
            policy,
            qf,
            n_epochs=2,
            batch_size=1024,
            use_new_version=use_new_version,
        )
        algorithm.train()
Example 21
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.envs.flattened_product_box import FlattenedProductBox
    from railrl.exploration_strategies.ou_strategy import OUStrategy
    from railrl.tf.policies.nn_policy import FeedForwardPolicy
    from railrl.qfunctions.nn_qfunction import FeedForwardCritic
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented
    )
    from railrl.launchers.launcher_util import (
        set_seed,
    )

    """
    Set up experiment variants.
    """
    seed = variant['seed']
    algo_params = variant['algo_params']
    env_class = variant['env_class']
    env_params = variant['env_params']
    memory_dim = variant['memory_dim']
    ou_params = variant['ou_params']

    set_seed(seed)

    """
    Code for running the experiment.
    """

    env = env_class(**env_params)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=memory_dim,
    )
    env = FlattenedProductBox(env)

    qf = FeedForwardCritic(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = FeedForwardPolicy(
        name_or_scope="policy",
        env_spec=env.spec,
    )
    es = OUStrategy(
        env_spec=env.spec,
        **ou_params
    )
    algorithm = DDPG(
        env,
        es,
        policy,
        qf,
        **algo_params
    )

    algorithm.train()
Example 22
def example(variant):
    load_policy_file = variant.get('load_policy_file', None)
    if load_policy_file is not None and exists(load_policy_file):
        data = joblib.load(load_policy_file)
        algorithm = data['algorithm']
        epochs = algorithm.num_epochs - data['epoch']
        algorithm.num_epochs = epochs
        use_gpu = variant['use_gpu']
        if use_gpu and ptu.gpu_enabled():
            algorithm.cuda()
        algorithm.train()
    else:
        es_min_sigma = variant['es_min_sigma']
        es_max_sigma = variant['es_max_sigma']
        num_epochs = variant['num_epochs']
        batch_size = variant['batch_size']
        use_gpu = variant['use_gpu']
        dueling = variant['dueling']

        env = normalize(gym_env('Reacher-v1'))
        es = OUStrategy(
            max_sigma=es_max_sigma,
            min_sigma=es_min_sigma,
            action_space=env.action_space,
        )
        if dueling:
            qf = FeedForwardDuelingQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        else:
            qf = FeedForwardQFunction(
                int(env.observation_space.flat_dim),
                int(env.action_space.flat_dim),
                100,
                100,
            )
        policy = FeedForwardPolicy(
            int(env.observation_space.flat_dim),
            int(env.action_space.flat_dim),
            100,
            100,
        )
        algorithm = DDPG(
            env,
            qf,
            policy,
            es,
            num_epochs=num_epochs,
            batch_size=batch_size,
        )
        if use_gpu:
            algorithm.cuda()
        algorithm.train()
Example 23
def experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_dim
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        hidden_sizes=[400, 300],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        hidden_sizes=[400, 300],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env, **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 24
def experiment(variant):
    imsize = variant['imsize']
    history = variant['history']

    # env = InvertedDoublePendulumEnv()  # or: gym.make(variant['env_id'])
    # env = SawyerXYZEnv()
    env = RandomGoalPusher2DEnv()
    partial_obs_size = env.obs_dim
    env = NormalizedBoxEnv(
        ImageMujocoWithObsEnv(env,
                              imsize=imsize,
                              keep_prev=history - 1,
                              init_camera=variant['init_camera']))
    #    es = GaussianStrategy(
    #        action_space=env.action_space,
    #    )
    es = OUStrategy(action_space=env.action_space)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size

    qf = MergedCNN(input_width=imsize,
                   input_height=imsize,
                   output_size=1,
                   input_channels=history,
                   added_fc_input_size=action_dim + partial_obs_size,
                   **variant['cnn_params'])

    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=partial_obs_size,
        output_size=action_dim,
        input_channels=history,
        **variant['cnn_params'],
        output_activation=torch.tanh,
    )

    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = DDPG(
        env,
        qf=qf,
        policy=policy,
        #        qf_weight_decay=.01,
        exploration_policy=exploration_policy,
        **variant['algo_params'])

    algorithm.to(ptu.device)
    algorithm.train()
Example 25
def her_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    if 'history_len' in variant:
        history_len = variant['history_len']
        env = MultiTaskHistoryEnv(env, history_len=history_len)
    if variant.get('make_silent_env', True):
        env = MultitaskEnvToSilentMultitaskEnv(env)
    if variant['normalize']:
        env = NormalizedBoxEnv(env)
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space, **variant['es_kwargs'])
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            **variant['es_kwargs'],
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    goal_dim = env.goal_space.low.size
    qf1 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    qf2 = FlattenMlp(input_size=obs_dim + action_dim + goal_dim,
                     output_size=1,
                     **variant['qf_kwargs'])
    policy = TanhMlpPolicy(input_size=obs_dim + goal_dim,
                           output_size=action_dim,
                           **variant['policy_kwargs'])
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env, **variant['replay_buffer_kwargs'])
    algorithm = HerTd3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       exploration_policy=exploration_policy,
                       replay_buffer=replay_buffer,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
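The keys read by her_td3_experiment above imply a variant shaped roughly like the sketch below; class names are either taken from the surrounding examples or labeled as hypothetical, and the numbers are placeholders rather than a tested configuration:

    # Sketch of the variant structure expected by her_td3_experiment().
    variant = dict(
        env_class=SomeMultitaskEnv,  # hypothetical multitask env class
        env_kwargs=dict(),
        normalize=True,
        exploration_type='ou',
        es_kwargs=dict(max_sigma=0.2, min_sigma=0.2),
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        replay_buffer_class=HerReplayBuffer,  # as used in the examples above
        replay_buffer_kwargs=dict(),
        algo_kwargs=dict(),  # forwarded to HerTd3
    )
    her_td3_experiment(variant)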
Example 26
def experiment(variant):
    env = variant['env_class']()

    obs_dim = int(np.prod(env.observation_space.low.shape))
    action_dim = int(np.prod(env.action_space.low.shape))
    vectorized = variant['algo_params']['tdm_kwargs']['vectorized']
    if vectorized:
        qf = VectorizedDiscreteQFunction(
            observation_dim=obs_dim,
            action_dim=env.action_space.n,
            goal_dim=env.goal_dim,
            **variant['qf_params'])
        policy = ArgmaxDiscreteTdmPolicy(qf, **variant['policy_params'])
    else:
        qf = FlattenMlp(
            input_size=obs_dim + env.goal_dim + 1,
            output_size=env.action_space.n,
            **variant['qf_params'])
        policy = ArgmaxDiscretePolicy(qf)
    es = OUStrategy(
        action_space=env.action_space,
        theta=0.1,
        max_sigma=0.1,
        min_sigma=0.1,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = HerReplayBuffer(env=env,
                                    **variant['her_replay_buffer_params'])
    qf_criterion = variant['qf_criterion_class'](
        **variant['qf_criterion_params'])
    algo_params = variant['algo_params']
    algo_params['ddpg_kwargs']['qf_criterion'] = qf_criterion
    plotter = Simple1DTdmDiscretePlotter(
        tdm=qf,
        location_lst=np.array([-5, 0, 5]),
        goal_lst=np.array([-5, 0, 5]),
        max_tau=algo_params['tdm_kwargs']['max_tau'],
        grid_size=10,
    )
    algo_params['ddpg_kwargs']['plotter'] = plotter
    algorithm = TdmDdpg(env,
                        qf=qf,
                        replay_buffer=replay_buffer,
                        policy=policy,
                        exploration_policy=exploration_policy,
                        **algo_params)
    algorithm.to(ptu.device)
    algorithm.train()
Example 27
def tdm_td3_experiment(variant):
    env = variant['env_class'](**variant['env_kwargs'])
    tdm_normalizer = None
    qf1 = TdmQf(env=env,
                vectorized=True,
                tdm_normalizer=tdm_normalizer,
                **variant['qf_kwargs'])
    qf2 = TdmQf(env=env,
                vectorized=True,
                tdm_normalizer=tdm_normalizer,
                **variant['qf_kwargs'])
    policy = TdmPolicy(env=env,
                       tdm_normalizer=tdm_normalizer,
                       **variant['policy_kwargs'])
    exploration_type = variant['exploration_type']
    if exploration_type == 'ou':
        es = OUStrategy(action_space=env.action_space)
    elif exploration_type == 'gaussian':
        es = GaussianStrategy(
            action_space=env.action_space,
            max_sigma=0.1,
            min_sigma=0.1,  # Constant sigma
        )
    elif exploration_type == 'epsilon':
        es = EpsilonGreedy(
            action_space=env.action_space,
            prob_random_action=0.1,
        )
    else:
        raise Exception("Invalid type: " + exploration_type)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = variant['replay_buffer_class'](
        env=env, **variant['replay_buffer_kwargs'])
    qf_criterion = variant['qf_criterion_class']()
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['td3_kwargs']['qf_criterion'] = qf_criterion
    algo_kwargs['tdm_kwargs']['tdm_normalizer'] = tdm_normalizer
    algorithm = TdmTd3(env,
                       qf1=qf1,
                       qf2=qf2,
                       replay_buffer=replay_buffer,
                       policy=policy,
                       exploration_policy=exploration_policy,
                       **algo_kwargs)
    algorithm.to(ptu.device)
    algorithm.train()
Example 28
def run_linear_ocm_exp(variant):
    from railrl.tf.ddpg import DDPG
    from railrl.envs.memory.continuous_memory_augmented import (
        ContinuousMemoryAugmented)
    from railrl.envs.memory.one_char_memory import (
        OneCharMemoryEndOnly, )
    from railrl.launchers.launcher_util import (
        set_seed, )
    """
    Set up experiment variants.
    """
    H = variant['H']
    seed = variant['seed']
    num_values = variant['num_values']
    algo_params = variant['algo_params']

    set_seed(seed)
    onehot_dim = num_values + 1

    env_action_dim = num_values + 1
    """
    Code for running the experiment.
    """

    env = OneCharMemoryEndOnly(n=num_values, num_steps=H, softmax_action=True)
    env = ContinuousMemoryAugmented(
        env,
        num_memory_states=onehot_dim,
    )
    # env = FlattenedProductBox(env)

    # qf = FeedForwardCritic(
    #     name_or_scope="critic",
    #     env_spec=env.spec,
    # )
    qf = MlpMemoryQFunction(
        name_or_scope="critic",
        env_spec=env.spec,
    )
    policy = ActionAwareMemoryPolicy(
        name_or_scope="noisy_policy",
        action_dim=env_action_dim,
        memory_dim=onehot_dim,  # match num_memory_states of the augmented env above
        env_spec=env.spec,
    )
    es = OUStrategy(env_spec=env.spec)
    algorithm = DDPG(env, es, policy, qf, **algo_params)

    algorithm.train()
Example 29
def example(variant):
    env = variant['env_class']()
    env = normalize(env)
    es = OUStrategy(action_space=env.action_space)
    qf = NafPolicy(
        int(env.observation_space.flat_dim),
        int(env.action_space.flat_dim),
        100,
    )
    algorithm = NAF(env,
                    naf_policy=qf,
                    exploration_strategy=es,
                    **variant['algo_params'])
    algorithm.to(ptu.device)
    algorithm.train()
Example 30
def experiment(variant):
    env_params = variant['env_params']
    env = SawyerXYReachingEnv(**env_params)
    obs_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[100, 100],
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        hidden_sizes=[100, 100],
    )
    # es = GaussianStrategy(
    #     action_space=env.action_space,
    #     **variant['es_kwargs']
    # )
    # es = EpsilonGreedy(
    #     action_space=env.action_space,
    #     prob_random_action=0.2,
    # )
    es = OUStrategy(
        action_space=env.action_space,
        **variant['es_kwargs']
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    algorithm = TD3(
        env,
        qf1=qf1,
        qf2=qf2,
        policy=policy,
        exploration_policy=exploration_policy,
        **variant['algo_kwargs']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()