Example #1
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: LearningRateSchedule,
        net_arch: Optional[List[int]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        super(CnnPolicy, self).__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            features_extractor_class,
            features_extractor_kwargs,
            normalize_images,
            optimizer_class,
            optimizer_kwargs,
        )


register_policy("MlpPolicy", MlpPolicy)
register_policy("CnnPolicy", CnnPolicy)
Example #2
        lr_schedule: Schedule,
        net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
        n_critics: int = 2,
        share_features_extractor: bool = True,
    ):
        super(MultiInputPolicy, self).__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            features_extractor_class,
            features_extractor_kwargs,
            normalize_images,
            optimizer_class,
            optimizer_kwargs,
            n_critics,
            share_features_extractor,
        )


register_policy("MlpPolicy", MlpPolicy)
register_policy("CnnPolicy", CnnPolicy)
register_policy("MultiInputPolicy", MultiInputPolicy)
Example #3
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, _check_length_scale
import timeit
import copy
from multiprocessing import Queue, Process, Manager

import os

from MetaBayesOpt.AquisitionFunctions import MLPAF
from stable_baselines3.common.policies import register_policy

MMetric = None

__ACQUISITION__ = 'PI'

register_policy('MLPAF', MLPAF)

mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
maximum_search_points = 2**20  #2**int(log2(mem_bytes/512 - 1))

__measure_time__ = False
__USE_CPP_BACKEND__ = None
__REGRESSOR_LIB__ = "SKLearn"

if __REGRESSOR_LIB__ == "GPY":
    try:
        import GPy
    except ImportError:
        print("Importing GPy failed, falling back to scikit-learn...")
        __REGRESSOR_LIB__ = "SKLearn"  # keep the same spelling as the default above
Example #4
        # Make batch out of tensor (consisting of n-stacked octrees)
        octree_batch = preprocess_stacked_octree_batch(observation,
                                                       self.device)

        with th.no_grad():
            actions = self._predict(octree_batch, deterministic=deterministic)
        # Convert to numpy
        actions = actions.cpu().numpy()

        if isinstance(self.action_space, gym.spaces.Box):
            if self.squash_output:
                # Rescale to proper domain when using squashing
                actions = self.unscale_action(actions)
            else:
                # Actions could be on arbitrary scale, so clip the actions to avoid
                # out of bound error (e.g. if sampling from a Gaussian distribution)
                actions = np.clip(actions, self.action_space.low,
                                  self.action_space.high)

        if not vectorized_env:
            if state is not None:
                raise ValueError(
                    "Error: The environment must be vectorized when using recurrent policies."
                )
            actions = actions[0]

        return actions, state


register_policy("OctreeCnnPolicy", OctreeCnnPolicy)
Example #5
# This file is here just to define MlpPolicy/CnnPolicy
# that work for A2C
from stable_baselines3.common.policies import (
    ActorCriticCnnPolicy,
    ActorCriticPolicy,
    MultiInputActorCriticPolicy,
    register_policy,
)

MlpPolicy = ActorCriticPolicy
CnnPolicy = ActorCriticCnnPolicy
MultiInputPolicy = MultiInputActorCriticPolicy

register_policy("MlpPolicy", ActorCriticPolicy)
register_policy("CnnPolicy", ActorCriticCnnPolicy)
register_policy("MultiInputPolicy", MultiInputPolicy)
Example #6
# This file is here just to define MlpPolicy/CnnPolicy
# that work for A2C
from stable_baselines3.common.policies import ActorCriticCnnPolicy, ActorCriticPolicy, register_policy

MlpPolicy = ActorCriticPolicy
CnnPolicy = ActorCriticCnnPolicy

register_policy("MlpPolicy", ActorCriticPolicy)
register_policy("CnnPolicy", ActorCriticCnnPolicy)
Example #7
                 normalize_images: bool = True,
                 optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
                 optimizer_kwargs: Optional[Dict[str, Any]] = None):
        super(DQNPolicyAverageRewardAdjusted, self).__init__(
            observation_space, action_space, lr_schedule, net_arch, device,
            activation_fn, features_extractor_class, features_extractor_kwargs,
            normalize_images, optimizer_class, optimizer_kwargs)

    def make_q_net(self) -> QNetworkAverageRewardAdjusted:
        # Make sure we always have separate networks for feature extractors etc
        features_extractor = self.features_extractor_class(
            self.observation_space, **self.features_extractor_kwargs)
        features_dim = features_extractor.features_dim
        return QNetworkAverageRewardAdjusted(
            features_extractor=features_extractor,
            features_dim=features_dim,
            **self.net_args).to(self.device)

    def _predict(self,
                 obs: th.Tensor,
                 deterministic: bool = True) -> th.Tensor:
        action, q_values = self.q_net._predict(obs,
                                               deterministic=deterministic)
        return action


register_policy("MlpAverageRewardAdjustedPolicy",
                DQNPolicyAverageRewardAdjusted)
Example #8
import gym
import torch as th
from torch import nn

from stable_baselines3.common.policies import BasePolicy, register_policy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor, NatureCNN, create_mlp
from stable_baselines3.dqn.policies import DQNPolicy, QNetwork


class SoftQNetwork(QNetwork):
    def _predict(self, observation: th.Tensor, deterministic: bool = True) -> th.Tensor:
        # Boltzmann (softmax) action selection over the Q-values with an
        # inverse temperature of 10; note that `deterministic` is ignored and
        # actions are always sampled.
        q_values = self.forward(observation)
        probs = nn.functional.softmax(q_values * 10, dim=1)
        m = th.distributions.Categorical(probs)
        action = m.sample().reshape(-1)
        return action


class SQLPolicy(DQNPolicy):
    def make_q_net(self) -> SoftQNetwork:
        # Make sure we always have separate networks for features extractors etc
        net_args = self._update_features_extractor(
            self.net_args, features_extractor=None)
        return SoftQNetwork(**net_args).to(self.device)


SoftMlpPolicy = SQLPolicy

register_policy("SoftMlpPolicy", SoftMlpPolicy)