Code Example #1
def experiment(variant):
    ptu._use_gpu = variant['gpu']

    env = NormalizedBoxEnv(gym.make(variant['env_name']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        qf.cuda()

    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    algorithm = SQL(env=env, qf=qf, policy=policy, **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
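
All of these entry points expect a variant dictionary assembled by a launcher script. As a rough orientation, here is a minimal, hypothetical dictionary covering only the keys Code Example #1 reads (gpu, env_name, net_size, algo_params); the values are placeholders, and whatever settings the SQL implementation requires would go inside algo_params.

# Hypothetical variant for Code Example #1; values are illustrative only.
variant = dict(
    gpu=False,                 # consumed via ptu._use_gpu / ptu.gpu_enabled()
    env_name='Pendulum-v0',    # any continuous-action gym environment id works
    net_size=128,              # width of both hidden layers
    algo_params=dict(),        # SQL-specific settings would go here
)
algorithm = experiment(variant)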
Code Example #2
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 0), (-5, 0), (0, 5), (0, -5)]

    q_fcn_positions = [(-2.5, 0.0), (0.0, 0.0), (2.5, 2.5)]

    n_demons = len(goal_positions)

    ptu._use_gpu = variant['gpu']

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['epoch_plotter'] = plotter
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = SQL(env=env, qf=qf, policy=policy, **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #3
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(Pusher2D3DofMultiGoalEnv(**variant['env_params']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = SamplingPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    if ptu.gpu_enabled():
        policy.cuda()

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(
        env=env,
        training_env=env,
        save_environment=False,
        qf=qf,
        policy=policy,
        # algo_interface='torch',
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train(online=True)

    return algorithm
Code Example #4
def experiment(variant):
    env = NormalizedBoxEnv(gym.make(variant['env_name']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SAC(explo_env=env,
                    policy=policy,
                    qf=qf,
                    vf=vf,
                    **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()
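
As a quick sanity check, the networks built in Code Example #4 can be evaluated directly, assuming the usual rlkit-style FlattenMlp behaviour in which all inputs are concatenated before the hidden layers. The dimensions below are placeholders, not values from the original experiment.

import torch

obs_dim, action_dim, net_size = 17, 6, 128   # hypothetical dimensions
qf = FlattenMlp(
    hidden_sizes=[net_size, net_size],
    input_size=obs_dim + action_dim,
    output_size=1,
)
obs = torch.randn(32, obs_dim)               # a batch of 32 fake observations
acts = torch.randn(32, action_dim)           # matching fake actions
q_values = qf(obs, acts)                     # inputs concatenated internally -> shape (32, 1)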
Code Example #5
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(Pusher2D3DofGoalCompoEnv(**variant['env_params']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size))
        for _ in range(n_unintentional)
    ]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        ) for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape), np.prod(env.action_space.shape),
        n_unintentional)
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(env=env,
                      training_env=env,
                      save_environment=False,
                      u_qfs=u_qfs,
                      u_policies=u_policies,
                      i_policy=i_policy,
                      i_qf=i_qf,
                      algo_interface='torch',
                      min_buffer_size=variant['algo_params']['batch_size'],
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train(online=True)

    return algorithm
Code Example #6
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']

    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=[net_size, net_size])
    policy = TanhMlpPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )
    es = OUStrategy(
        action_space=env.action_space,
        mu=0,
        theta=0.15,
        max_sigma=0.3,
        min_sigma=0.3,
        decay_period=100000,
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = DDPG(
        explo_env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #7
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 0), (-5, 0), (0, 5), (0, -5)]

    q_fcn_positions = [(-2.5, 0.0), (0.0, 0.0), (2.5, 2.5)]

    n_demons = len(goal_positions)

    ptu._use_gpu = variant['gpu']

    env = NormalizedBoxEnv(PusherEnv(goal=variant['env_params'].get('goal')))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))
    if ptu.gpu_enabled():
        qf.cuda()

    # _i_policy = TanhGaussianPolicy(
    policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    if ptu.gpu_enabled():
        policy.cuda()

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(env=env, qf=qf, policy=policy, **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #8
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    # env = NormalizedBoxEnv(
    #     Reacher2D3DofBulletEnv(**variant['env_params'])
    # )
    env = Reacher2D3DofBulletEnv(**variant['env_params'])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    initial_conds = [
        [10, 5, 20, 0.2, 0.5, 0],
        [10, 5, 20, 0.1, 0.1, 0],
        [10, 5, 20, 0.15, 0.8, 0],
    ]

    for init_cond in initial_conds:
        env.add_initial_condition(robot_config=np.deg2rad(init_cond[:3]),
                                  tgt_state=init_cond[-3:])

    net_size = variant['net_size']
    # global_policy = TanhGaussianPolicy(
    global_policy = MlpPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    local_policies = [
        LinearGaussianPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            T=PATH_LENGTH,
        ) for _ in range(N_LOCAL_POLS)
    ]
    #
    # replay_buffer = FakeReplayBuffer()
    # variant['algo_params']['replay_buffer'] = replay_buffer
    #
    # # QF Plot
    # # variant['algo_params']['epoch_plotter'] = None

    algorithm = MDGPS(env=env,
                      eval_env=env,
                      save_environment=False,
                      local_policies=local_policies,
                      global_policy=global_policy,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #9
def experiment(variant):
    ptu._use_gpu = variant['gpu']
    env = NormalizedBoxEnv(gym.make(variant['env_name']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']

    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SoftActorCritic(
        env=env,
        training_env=env,
        save_environment=False,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.train()

    return algorithm
Code Example #10
    def forward(
        self,
        obs,
        deterministic=False,
    ):
        """
        :param obs: Observation
        :param deterministic: If True, do not sample
        """
        # TODO: confirm how deterministic actions should be produced (here the latent is zeroed)
        latent_shape = (*list(obs.shape)[:-1], self._action_dim)
        if deterministic:
            latent = torch.zeros(latent_shape)
        else:
            latent = self._latent_dist.sample(latent_shape).squeeze(-1)

        if ptu.gpu_enabled():
            latent = latent.cuda()

        h = torch.cat([obs, latent], dim=-1)
        # print('--- INPUT ---')
        # print(torch.cat([obs, latent], dim=-1)[:5, :])
        for i, fc in enumerate(self.fcs):
            # h = self.hidden_activation(fc(h))
            h = fc(h)
            if self.layer_norm and i < len(self.fcs) - 1:
                h = self.layer_norms[i](h)
            h = self.hidden_activation(h)

        action = self.last_fc(h)

        if self._squash:
            action = torch.tanh(action)

        info_dict = dict()

        return action, info_dict
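
For reference, this forward pass can be exercised on its own. The sketch below assumes the method belongs to the SamplingPolicy class constructed in the earlier examples and that observations are batched along the first dimension; the dimensions are placeholders.

import torch

obs_dim, action_dim = 17, 6                        # hypothetical dimensions
policy = SamplingPolicy(obs_dim=obs_dim,
                        action_dim=action_dim,
                        hidden_sizes=[128, 128])
obs = torch.randn(32, obs_dim)                     # batch of 32 observations
action, info = policy(obs)                         # stochastic: latent noise is sampled
det_action, _ = policy(obs, deterministic=True)    # deterministic: latent is zeroed
print(action.shape)                                # torch.Size([32, action_dim])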
Code Example #11
def experiment(variant):
    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(gym.make(variant['env_name']))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[net_size, net_size],
    )

    replay_buffer = SimpleReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    # variant['algo_params']['epoch_plotter'] = None

    algorithm = Reinforce(
        env=env,
        # training_env=env,
        save_environment=False,
        policy=policy,
        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #12
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        i_qf2 = data['qf2']
        u_qf = data['u_qf']
        u_qf2 = data['u_qf2']
        i_vf = data['i_vf']
        u_vf = data['u_vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        if USE_Q2:
            u_qf2 = NNMultiQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                n_qs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_qf2 = None
            i_qf2 = None

        if EXPLICIT_VF:
            u_vf = NNMultiVFunction(
                obs_dim=obs_dim,
                n_vs=n_unintentional,
                hidden_activation=variant['hidden_activation'],
                # shared_hidden_sizes=[net_size, net_size],
                shared_hidden_sizes=[net_size],
                # shared_hidden_sizes=[],
                unshared_hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
            i_vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_sizes=[net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            u_vf = None
            i_vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy, n_unintentional, obs_dim,
                batch_size=50,
                total_iters=1000,
            )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUSAC(
        env=env,
        policy=policy,
        u_qf1=u_qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        i_qf1=i_qf,
        u_qf2=u_qf2,
        i_qf2=i_qf2,
        u_vf=u_vf,
        i_vf=i_vf,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    # algorithm.pretrain(10000)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #13
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    n_unintentional = 2

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        i_qf = data['qf']
        u_qf = data['u_qf']
        policy = data['policy']
        exploration_policy = data['exploration_policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        u_qf = NNMultiQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_qs=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        i_qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            n_policies=n_unintentional,
            hidden_activation=variant['hidden_activation'],
            # shared_hidden_sizes=[net_size, net_size],
            shared_hidden_sizes=[net_size],
            # shared_hidden_sizes=[],
            unshared_hidden_sizes=[net_size, net_size],
            unshared_mix_hidden_sizes=[net_size, net_size],
            stds=None,
            input_norm=variant['input_norm'],
            shared_layer_norm=variant['shared_layer_norm'],
            policies_layer_norm=variant['policies_layer_norm'],
            mixture_layer_norm=variant['mixture_layer_norm'],
            mixing_temperature=1.,
            softmax_weights=variant['softmax_weights'],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

        if INIT_AVG_MIXING:
            set_average_mixing(
                policy,
                n_unintentional,
                obs_dim,
                batch_size=50,
                total_iters=1000,
            )

        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = MultiGoalReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
        reward_vector_size=n_unintentional,
    )

    algorithm = HIUDDPG(env=env,
                        policy=policy,
                        explo_policy=exploration_policy,
                        u_qf=u_qf,
                        replay_buffer=replay_buffer,
                        batch_size=BATCH_SIZE,
                        i_qf=i_qf,
                        eval_env=env,
                        save_environment=False,
                        **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #14
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)

    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size]
        )
        policy = TanhMlpPolicy(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )
        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #15
def experiment(variant):
    exploration_pol_id = 1
    render_q = True

    variant['algo_params']['exploration_pol_id'] = exploration_pol_id
    save_q_path = '/home/desteban/logs/two_q_plots%d' % exploration_pol_id

    goal_positions = [(5, 5), (-5, 5)]

    q_fcn_positions = [(5, 5), (0, 0), (-5, 5)]

    n_demons = len(goal_positions)

    ptu._use_gpu = variant['gpu']

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    # qfs = [FlattenMlp(
    #        hidden_sizes=[net_size, net_size],
    #        input_size=obs_dim + action_dim,
    #        output_size=1) for _ in range(n_demons)]
    qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size)) for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for qf in qfs:
            qf.cuda()

    policies = [
        StochasticPolicy(hidden_sizes=[net_size, net_size],
                         obs_dim=obs_dim,
                         action_dim=action_dim) for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for policy in policies:
            policy.cuda()

    replay_buffer = MultiEnvReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        reward_vector_size=n_demons,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qfs,
        policy=policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(env=env,
                      u_qfs=qfs,
                      u_policies=policies,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #16
def experiment(variant):
    render_q = variant['render_q']
    save_q_path = '/home/desteban/logs/goalcompo_q_plots'

    ptu.set_gpu_mode(variant['gpu'])

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params'])
    )

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    n_unintentional = 2

    net_size = variant['net_size']
    u_qfs = [NNQFunction(obs_dim=obs_dim,
                         action_dim=action_dim,
                         hidden_sizes=(net_size, net_size))
             for _ in range(n_unintentional)]
    # i_qf = AvgNNQFunction(obs_dim=obs_dim,
    i_qf = SumNNQFunction(obs_dim=obs_dim,
                          action_dim=action_dim,
                          q_functions=u_qfs)

    # _i_policy = TanhGaussianPolicy(
    u_policies = [
        StochasticPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim,
            action_dim=action_dim,
        ) for _ in range(n_unintentional)
    ]
    i_policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    replay_buffer = MultiGoalReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        np.prod(env.observation_space.shape),
        np.prod(env.action_space.shape),
        n_unintentional
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    goal_pos = variant['env_params']['goal_position']
    q_fcn_positions = [
        (goal_pos[0], 0.0),
        (0.0, 0.0),
        (0.0, goal_pos[1])
    ]
    plotter = QFPolicyPlotter(
        i_qf=i_qf,
        i_policy=i_policy,
        u_qfs=u_qfs,
        u_policies=u_policies,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(
        env=env,
        training_env=env,
        save_environment=False,
        u_qfs=u_qfs,
        u_policies=u_policies,
        i_policy=i_policy,
        i_qf=i_qf,
        algo_interface='torch',
        min_buffer_size=variant['algo_params']['batch_size'],
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #17
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    np.random.seed(SEED)

    ptu.set_gpu_mode(variant['gpu'])
    ptu.seed(SEED)

    goal = variant['env_params'].get('goal')
    variant['env_params']['goal_poses'] = \
        [goal, (goal[0], 'any'), ('any', goal[1])]
    variant['env_params'].pop('goal')

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    if variant['log_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size]
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_sizes=[net_size, net_size],
        )

        # Clamp model parameters
        qf.clamp_all_params(min=-0.003, max=0.003)
        policy.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_replay_buffer_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = PPO(
        env=env,
        policy=policy,
        qf=qf,
        # replay_buffer=replay_buffer,
        # batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    # algorithm.pretrain(PATH_LENGTH*2)
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #18
env = create_environment()
#
# for cc in range(env.n_init_conds):
#     env.reset(condition=cc)
#     print(cc)
#     input('wuuu')

cost_fcn = create_cost_fcn(env)

local_policies, global_policy = create_policies(env)

mdgps_algo = create_algo(env, local_policies, global_policy, cost_fcn)

# if ptu.gpu_enabled():
#     mdgps_algo.cuda()
if ptu.gpu_enabled():
    global_policy.cuda()

start_epoch = 0
mdgps_algo.train(start_epoch=start_epoch)

# action_dim = env.action_dim
# obs_dim = env.obs_dim
# state_dim = env.state_dim
#
# print(action_dim, obs_dim, state_dim)
#
# fake_sample = dict(
#     actions=np.random.rand(10, action_dim),
#     observations=np.random.rand(10, obs_dim)
# )
Code Example #19
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Reacher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        raise NotImplementedError
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )
        es = OUStrategy(
            action_space=env.action_space,
            mu=0,
            theta=0.15,
            max_sigma=0.3,
            min_sigma=0.3,
            decay_period=100000,
        )
        exploration_policy = PolicyWrappedWithExplorationStrategy(
            exploration_strategy=es,
            policy=policy,
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = DDPG(
        explo_env=env,
        policy=policy,
        explo_policy=exploration_policy,
        qf=qf,
        replay_buffer=replay_buffer,
        batch_size=BATCH_SIZE,
        eval_env=env,
        save_environment=False,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #20
def experiment(variant):
    exploration_pol_id = 1

    variant['algo_params']['exploration_pol_id'] = exploration_pol_id

    n_demons = 2

    ptu._use_gpu = variant['gpu']

    env = NormalizedBoxEnv(
        Reacher2D3DofObstacleEnv(is_render=False,
                                 obs_with_img=False,
                                 rdn_tgt_pose=True,
                                 sim_timestep=0.001,
                                 frame_skip=10,
                                 obs_distances=True,
                                 tgt_cost_weight=1.0,
                                 obst_cost_weight=3.0,
                                 ctrl_cost_weight=1.0e-2,
                                 safe_radius=0.15,
                                 inside_cost=1,
                                 outside_cost=0))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    # qfs = [FlattenMlp(
    #        hidden_sizes=[net_size, net_size],
    #        input_size=obs_dim + action_dim,
    #        output_size=1) for _ in range(n_demons)]
    qfs = [
        NNQFunction(obs_dim=obs_dim,
                    action_dim=action_dim,
                    hidden_sizes=(net_size, net_size)) for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for qf in qfs:
            qf.cuda()

    policies = [
        StochasticPolicy(hidden_sizes=[net_size, net_size],
                         obs_dim=obs_dim,
                         action_dim=action_dim) for _ in range(n_demons)
    ]
    if ptu.gpu_enabled():
        for policy in policies:
            policy.cuda()

    replay_buffer = MultiEnvReplayBuffer(
        variant['algo_params']['replay_buffer_size'],
        env,
        reward_vector_size=n_demons,
    )
    variant['algo_params']['replay_buffer'] = replay_buffer

    # QF Plot
    variant['algo_params']['_epoch_plotter'] = None

    algorithm = IUSQL(env=env,
                      u_qfs=qfs,
                      u_policies=policies,
                      **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return algorithm
Code Example #21
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=expt_params['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=expt_params['hidden_activation'],
                hidden_sizes=[net_size, net_size],
            )
        else:
            qf2 = None
        vf = NNVFunction(
            obs_dim=obs_dim,
            hidden_activation=expt_params['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )
        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=expt_params['hidden_activation'],
            hidden_sizes=[net_size, net_size],
        )

        # # Clamp model parameters
        # qf.clamp_all_params(min=-0.003, max=0.003)
        # vf.clamp_all_params(min=-0.003, max=0.003)
        # policy.clamp_all_params(min=-0.003, max=0.003)
        # if USE_Q2:
        #     qf2.clamp_all_params(min=-0.003, max=0.003)

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(explo_env=env,
                    policy=policy,
                    qf=qf,
                    vf=vf,
                    replay_buffer=replay_buffer,
                    batch_size=BATCH_SIZE,
                    qf2=qf2,
                    eval_env=env,
                    save_environment=False,
                    **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda()

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #22
def experiment(variant):

    # os.environ['OMP_NUM_THREADS'] = str(NP_THREADS)

    # Set seeds
    np.random.seed(variant['seed'])
    ptu.set_gpu_mode(variant['gpu'], gpu_id=0)
    ptu.seed(variant['seed'])
    variant['env_params']['seed'] = variant['seed']

    env = NormalizedBoxEnv(
        Navigation2dGoalCompoEnv(**variant['env_params']),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )

    obs_dim = env.obs_dim
    action_dim = env.action_dim

    if variant['load_dir']:
        params_file = os.path.join(variant['log_dir'], 'params.pkl')
        data = joblib.load(params_file)
        start_epoch = data['epoch']
        qf = data['qf']
        qf2 = data['qf2']
        vf = data['vf']
        policy = data['policy']
        env._obs_mean = data['obs_mean']
        env._obs_var = data['obs_var']
    else:
        start_epoch = 0
        net_size = variant['net_size']

        qf = NNQFunction(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['q_hidden_w_init'],
            output_w_init=variant['q_output_w_init'],
        )
        if USE_Q2:
            qf2 = NNQFunction(
                obs_dim=obs_dim,
                action_dim=action_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['q_hidden_w_init'],
                output_w_init=variant['q_output_w_init'],
            )
        else:
            qf2 = None

        if EXPLICIT_VF:
            vf = NNVFunction(
                obs_dim=obs_dim,
                hidden_activation=variant['hidden_activation'],
                hidden_sizes=[net_size, net_size, net_size],
                hidden_w_init=variant['v_hidden_w_init'],
                output_w_init=variant['v_output_w_init'],
            )
        else:
            vf = None

        policy = POLICY(
            obs_dim=obs_dim,
            action_dim=action_dim,
            hidden_activation=variant['hidden_activation'],
            hidden_sizes=[net_size, net_size, net_size],
            hidden_w_init=variant['pol_hidden_w_init'],
            output_w_init=variant['pol_output_w_init'],
        )

    replay_buffer = SimpleReplayBuffer(
        max_size=variant['replay_buffer_size'],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = SAC(explo_env=env,
                    policy=policy,
                    qf=qf,
                    qf2=qf2,
                    vf=vf,
                    replay_buffer=replay_buffer,
                    batch_size=BATCH_SIZE,
                    eval_env=env,
                    save_environment=False,
                    **variant['algo_params'])
    if ptu.gpu_enabled():
        algorithm.cuda(ptu.device)

    algorithm.pretrain(variant['steps_pretrain'])
    algorithm.train(start_epoch=start_epoch)

    return algorithm
Code Example #23
def experiment(variant):
    render_q = True
    save_q_path = '/home/desteban/logs/two_q_plots'
    goal_positions = [(5, 5), (-5, 5)]

    q_fcn_positions = [(5, 5), (0, 0), (-5, 5)]

    n_demons = len(goal_positions)

    ptu._use_gpu = variant['gpu']

    env = NormalizedBoxEnv(
        MultiCompositionEnv(
            actuation_cost_coeff=30,
            distance_cost_coeff=1.0,
            goal_reward=10,
            init_sigma=0.1,
            goal_positions=goal_positions,
        ))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = NNQFunction(obs_dim=obs_dim,
                     action_dim=action_dim,
                     hidden_sizes=(net_size, net_size))

    # _i_policy = TanhGaussianPolicy(
    policy = StochasticPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # QF Plot
    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=q_fcn_positions,
        default_action=[np.nan, np.nan],
        n_samples=100,
        render=render_q,
        save_path=save_q_path,
    )
    variant['algo_params']['_epoch_plotter'] = plotter
    # variant['algo_params']['_epoch_plotter'] = None

    algorithm = SQL(env=env, qf=qf, policy=policy, **variant['algo_params'])
    # Debug check: print whether each model's parameters are on the GPU
    # before moving the algorithm to CUDA...
    for net in algorithm.torch_models:
        print(net)
        for pp in net.parameters():
            print(pp.is_cuda)

    print('-----------')
    input('Press Enter to move the models to the GPU (if enabled)...')
    if ptu.gpu_enabled():
        algorithm.cuda()

    # ...and again afterwards, to confirm the transfer.
    for net in algorithm.torch_models:
        print(net)
        for pp in net.parameters():
            print(pp.is_cuda)
    input('Press Enter to start training...')

    algorithm.train()

    return algorithm
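
The interactive parameter dump in Code Example #23 can also be done non-interactively. The helper below performs the same GPU-placement check; only torch_models and parameters() come from the snippet, the helper itself is an assumption.

def report_cuda_status(models):
    # For each model, report whether every parameter tensor lives on the GPU.
    for net in models:
        on_gpu = all(p.is_cuda for p in net.parameters())
        print('%s on GPU: %s' % (type(net).__name__, on_gpu))

# e.g. call before and after algorithm.cuda():
# report_cuda_status(algorithm.torch_models)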