def __init__(
    self,
    # state_net: StateNet,
    head_net: ValueHead,
):
    super().__init__()
    # self.state_net = state_net

    # convolutional feature extractor for 12-channel observations
    self.observation_net = nn.Sequential(
        nn.Conv2d(12, 16, kernel_size=3),
        nn.Dropout2d(p=0.1),
        nn.LeakyReLU(),
        nn.Conv2d(16, 32, kernel_size=3, groups=4),
        nn.Dropout2d(p=0.1),
        nn.LeakyReLU(),
        nn.Conv2d(32, 64, kernel_size=3, groups=4),
        # Flatten()
    )
    self.observation_net.apply(
        utils.create_optimal_inner_init(nn.LeakyReLU))

    # aggregation over the flattened convolutional features
    self.aggregation_net = nn.Sequential(
        Flatten(),
        nn.Linear(64, 64),
        nn.LayerNorm(64),
        nn.Dropout(p=0.1),
        nn.LeakyReLU(),
    )
    self.aggregation_net.apply(
        utils.create_optimal_inner_init(nn.LeakyReLU))

    self.head_net = head_net
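# Rough usage sketch (assumes this `__init__` belongs to an nn.Module whose
# forward chains observation_net -> aggregation_net -> head_net, and that
# `Flatten`/`utils` are the package helpers used throughout this module;
# sizes are illustrative):
# observation = torch.randn(4, 12, 7, 7)        # batch of 12-channel observations
# features = self.observation_net(observation)  # -> [4, 64, 1, 1] for 7x7 inputs
# embedding = self.aggregation_net(features)    # -> [4, 64]
# value = self.head_net(embedding)              # value estimate from ValueHead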
def __init__(self,
             action_size,
             layer_fn,
             activation_fn=nn.ReLU,
             bias=True,
             parity="odd"):
    """
    Conditional affine coupling layer used in Real NVP Bijector.
    Original paper: https://arxiv.org/abs/1605.08803
    Adaptation to RL: https://arxiv.org/abs/1804.02808

    Important notes
    ---------------
    1. State embeddings are supposed to have size (action_size * 2).
    2. Scale and translation networks used in the Real NVP Bijector
       both have one hidden layer of (action_size) (activation_fn) units.
    3. Parity ("odd" or "even") determines which part of the input
       is being copied and which is being transformed.
    """
    super().__init__()

    layer_fn = MODULES.get_if_str(layer_fn)
    activation_fn = MODULES.get_if_str(activation_fn)

    self.parity = parity
    if self.parity == "odd":
        self.copy_size = action_size // 2
    else:
        self.copy_size = action_size - action_size // 2

    self.scale_prenet = SequentialNet(
        hiddens=[action_size * 2 + self.copy_size, action_size],
        layer_fn=layer_fn,
        activation_fn=activation_fn,
        norm_fn=None,
        bias=bias)
    self.scale_net = SequentialNet(
        hiddens=[action_size, action_size - self.copy_size],
        layer_fn=layer_fn,
        activation_fn=None,
        norm_fn=None,
        bias=True)

    self.translation_prenet = SequentialNet(
        hiddens=[action_size * 2 + self.copy_size, action_size],
        layer_fn=layer_fn,
        activation_fn=activation_fn,
        norm_fn=None,
        bias=bias)
    self.translation_net = SequentialNet(
        hiddens=[action_size, action_size - self.copy_size],
        layer_fn=layer_fn,
        activation_fn=None,
        norm_fn=None,
        bias=True)

    inner_init = create_optimal_inner_init(nonlinearity=activation_fn)
    self.scale_prenet.apply(inner_init)
    self.scale_net.apply(outer_init)
    self.translation_prenet.apply(inner_init)
    self.translation_net.apply(outer_init)
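# Sketch of how the coupling transform typically uses these nets (the actual
# forward/log-det logic lives elsewhere in the bijector; `state_embedding` has
# size action_size * 2 as noted in the docstring, and which slice is copied vs.
# transformed depends on parity -- the first copy_size dims are used here
# purely for illustration):
# x_copy, x_rest = x[:, :self.copy_size], x[:, self.copy_size:]
# context = torch.cat([state_embedding, x_copy], dim=1)  # action_size*2 + copy_size
# scale = self.scale_net(self.scale_prenet(context))             # -> action_size - copy_size
# translation = self.translation_net(self.translation_prenet(context))
# y_rest = x_rest * torch.exp(scale) + translation       # affine transform of the rest
# y = torch.cat([x_copy, y_rest], dim=1)                 # copied part passes through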
def _get_convolution_net(in_channels: int,
                         history_len: int = 1,
                         channels: List = None,
                         kernel_sizes: List = None,
                         strides: List = None,
                         use_bias: bool = False,
                         use_groups: bool = False,
                         use_normalization: bool = False,
                         use_dropout: bool = False,
                         activation: str = "ReLU") -> nn.Module:
    channels = channels or [32, 64, 32]
    kernel_sizes = kernel_sizes or [8, 4, 3]
    strides = strides or [4, 2, 1]
    activation_fn = torch.nn.__dict__[activation]
    assert len(channels) == len(kernel_sizes) == len(strides)

    def _get_block(**conv_params):
        layers = [nn.Conv2d(**conv_params)]
        if use_normalization:
            layers.append(nn.InstanceNorm2d(conv_params["out_channels"]))
        if use_dropout:
            layers.append(nn.Dropout2d(p=0.1))
        layers.append(activation_fn(inplace=True))
        return layers

    channels.insert(0, history_len * in_channels)
    params = []
    for i, (in_channels, out_channels) in enumerate(utils.pairwise(channels)):
        num_groups = 1
        if use_groups:
            num_groups = history_len if i == 0 else 4
        params.append({
            "in_channels": in_channels,
            "out_channels": out_channels,
            "bias": use_bias,
            "kernel_size": kernel_sizes[i],
            "stride": strides[i],
            "groups": num_groups,
        })

    layers = []
    for block_params in params:
        layers.extend(_get_block(**block_params))

    net = nn.Sequential(*layers)
    net.apply(utils.create_optimal_inner_init(activation_fn))

    # input_shape: tuple = (3, 84, 84)
    # conv_input = torch.Tensor(torch.randn((1,) + input_shape))
    # conv_output = net(conv_input)
    # torch.Size([1, 32, 7, 7]), 1568
    # print(conv_output.shape, conv_output.nelement())

    return net
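# The commented shape check above, written out as a usage sketch (assumes
# RGB Atari-like 84x84 frames and the default channels/kernels/strides):
# net = _get_convolution_net(in_channels=3, history_len=1)
# conv_input = torch.randn(1, 3, 84, 84)
# conv_output = net(conv_input)
# assert conv_output.shape == torch.Size([1, 32, 7, 7])   # 1568 elements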
def get_convolution_net(in_channels: int,
                        history_len: int = 1,
                        channels: List = None,
                        kernel_sizes: List = None,
                        strides: List = None,
                        groups: List = None,
                        use_bias: bool = False,
                        normalization: str = None,
                        dropout_rate: float = None,
                        activation: str = "ReLU") -> nn.Module:
    channels = channels or [32, 64, 64]
    kernel_sizes = kernel_sizes or [8, 4, 3]
    strides = strides or [4, 2, 1]
    groups = groups or [1, 1, 1]
    activation_fn = nn.__dict__[activation]
    assert len(channels) == len(kernel_sizes) == len(strides) == len(groups)

    def _get_block(**conv_params):
        layers = [nn.Conv2d(**conv_params)]
        if normalization is not None:
            normalization_fn = MODULES.get_if_str(normalization)
            layers.append(normalization_fn(conv_params["out_channels"]))
        if dropout_rate is not None:
            layers.append(nn.Dropout2d(p=dropout_rate))
        layers.append(activation_fn(inplace=True))
        return layers

    channels.insert(0, history_len * in_channels)
    params = []
    for i, (in_channels, out_channels) in enumerate(utils.pairwise(channels)):
        params.append({
            "in_channels": in_channels,
            "out_channels": out_channels,
            "bias": use_bias,
            "kernel_size": kernel_sizes[i],
            "stride": strides[i],
            "groups": groups[i],
        })

    layers = []
    for block_params in params:
        layers.extend(_get_block(**block_params))

    net = nn.Sequential(*layers)
    net.apply(utils.create_optimal_inner_init(activation_fn))

    return net
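# Usage sketch with explicit normalization/dropout/groups (the string
# "InstanceNorm2d" is resolved through MODULES.get_if_str, assuming it is
# registered there; defaults from above otherwise):
# net = get_convolution_net(
#     in_channels=3,
#     history_len=4,
#     groups=[4, 1, 1],
#     normalization="InstanceNorm2d",
#     dropout_rate=0.1,
#     activation="LeakyReLU",
# )
# conv_output = net(torch.randn(1, 4 * 3, 84, 84))   # -> [1, 64, 7, 7]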
def _get_ff_main_net(in_features: int,
                     out_features: int,
                     use_bias: bool = False,
                     use_normalization: bool = False,
                     use_dropout: bool = False,
                     activation: str = "ReLU") -> nn.Module:
    activation_fn = torch.nn.__dict__[activation]

    layers = [nn.Linear(in_features, out_features, bias=use_bias)]
    if use_normalization:
        layers.append(nn.LayerNorm(out_features))
    if use_dropout:
        layers.append(nn.Dropout(p=0.1))
    layers.append(activation_fn(inplace=True))

    net = nn.Sequential(*layers)
    net.apply(utils.create_optimal_inner_init(activation_fn))

    return net
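# Minimal sketch: a single Linear -> LayerNorm -> Dropout -> ReLU block, e.g.
# for projecting the 1568 flattened convolutional features (sizes assumed):
# main_net = _get_ff_main_net(
#     in_features=1568, out_features=512,
#     use_normalization=True, use_dropout=True)
# hidden = main_net(torch.randn(4, 1568))   # -> [4, 512]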
def _get_linear_net(in_features: int,
                    history_len: int = 1,
                    features: List = None,
                    use_bias: bool = False,
                    use_normalization: bool = False,
                    use_dropout: bool = False,
                    activation: str = "ReLU") -> nn.Module:
    features = features or [64, 128, 64]
    activation_fn = torch.nn.__dict__[activation]

    def _get_block(**linear_params):
        layers = [nn.Linear(**linear_params)]
        if use_normalization:
            layers.append(nn.LayerNorm(linear_params["out_features"]))
        if use_dropout:
            layers.append(nn.Dropout(p=0.1))
        layers.append(activation_fn(inplace=True))
        return layers

    features.insert(0, history_len * in_features)
    params = []
    for i, (in_features, out_features) in enumerate(utils.pairwise(features)):
        params.append({
            "in_features": in_features,
            "out_features": out_features,
            "bias": use_bias,
        })

    layers = []
    for block_params in params:
        layers.extend(_get_block(**block_params))

    net = nn.Sequential(*layers)
    net.apply(utils.create_optimal_inner_init(activation_fn))

    return net
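# Usage sketch (history_len copies of a flat observation are assumed to be
# concatenated along the feature dimension before being fed in):
# net = _get_linear_net(in_features=24, history_len=4, use_normalization=True)
# out = net(torch.randn(8, 4 * 24))   # -> [8, 64] with the default [64, 128, 64]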
def _get_observation_net(history_len: int = 1,
                         conv1_size: int = 32,
                         conv2_size: int = 64,
                         conv3_size: int = 32,
                         use_bias: bool = False,
                         use_groups: bool = False,
                         use_normalization: bool = False,
                         use_dropout: bool = False,
                         activation: str = "ReLU") -> nn.Module:
    activation_fn = torch.nn.__dict__[activation]

    def _get_block(**conv_params):
        layers = [nn.Conv2d(**conv_params)]
        if use_normalization:
            layers.append(nn.InstanceNorm2d(conv_params["out_channels"]))
        if use_dropout:
            layers.append(nn.Dropout2d(p=0.1))
        layers.append(activation_fn(inplace=True))
        return layers

    params = [
        {
            "in_channels": history_len * 1,
            "out_channels": conv1_size,
            "bias": use_bias,
            "kernel_size": 8,
            "stride": 4,
            "groups": history_len if use_groups else 1,
        },
        {
            "in_channels": conv1_size,
            "out_channels": conv2_size,
            "bias": use_bias,
            "kernel_size": 4,
            "stride": 2,
            "groups": 4 if use_groups else 1,
        },
        {
            "in_channels": conv2_size,
            "out_channels": conv3_size,
            "bias": use_bias,
            "kernel_size": 3,
            "stride": 1,
            "groups": 4 if use_groups else 1,
        },
    ]

    layers = []
    for block_params in params:
        layers.extend(_get_block(**block_params))

    net = nn.Sequential(*layers)
    net.apply(utils.create_optimal_inner_init(activation_fn))

    # input_shape: tuple = (1, 84, 84)
    # conv_input = torch.Tensor(torch.randn((1,) + input_shape))
    # conv_output = net(conv_input)
    # torch.Size([1, 32, 7, 7]), 1568
    # print(conv_output.shape, conv_output.nelement())

    return net
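# Shape check from the comment above, written out (assumes grayscale frames,
# so the first convolution sees history_len channels):
# net = _get_observation_net(history_len=4, use_groups=True)
# obs = torch.randn(1, 4, 84, 84)   # 4 stacked 84x84 frames
# out = net(obs)                    # -> [1, 32, 7, 7], 1568 elements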
def __init__(
    self,
    hiddens,
    layer_fn: Union[str, Dict, List],
    norm_fn: Union[str, Dict, List] = None,
    dropout_fn: Union[str, Dict, List] = None,
    activation_fn: Union[str, Dict, List] = None,
    residual: Union[bool, str] = False,
    layer_order: List = None,
):
    super().__init__()
    assert len(hiddens) > 1, "No sequence found"

    # layer params
    layer_fn = _process_additional_params(layer_fn, hiddens[1:])
    # normalization params
    norm_fn = _process_additional_params(norm_fn, hiddens[1:])
    # dropout params
    dropout_fn = _process_additional_params(dropout_fn, hiddens[1:])
    # activation params
    activation_fn = _process_additional_params(activation_fn, hiddens[1:])

    if isinstance(residual, bool) and residual:
        residual = "hard"
    # residual params (one spec per layer, like the other *_fn params)
    residual = _process_additional_params(residual, hiddens[1:])

    layer_order = layer_order or ["layer", "norm", "drop", "act"]

    def _layer_fn(layer_fn, f_in, f_out, **kwargs):
        layer_fn = MODULES.get_if_str(layer_fn)
        layer_fn = layer_fn(f_in, f_out, **kwargs)
        return layer_fn

    def _normalization_fn(normalization_fn, f_in, f_out, **kwargs):
        normalization_fn = MODULES.get_if_str(normalization_fn)
        normalization_fn = \
            normalization_fn(f_out, **kwargs) \
            if normalization_fn is not None \
            else None
        return normalization_fn

    def _dropout_fn(dropout_fn, f_in, f_out, **kwargs):
        dropout_fn = MODULES.get_if_str(dropout_fn)
        dropout_fn = dropout_fn(**kwargs) \
            if dropout_fn is not None \
            else None
        return dropout_fn

    def _activation_fn(activation_fn, f_in, f_out, **kwargs):
        activation_fn = MODULES.get_if_str(activation_fn)
        activation_fn = activation_fn(**kwargs) \
            if activation_fn is not None \
            else None
        return activation_fn

    name2fn = {
        "layer": _layer_fn,
        "norm": _normalization_fn,
        "drop": _dropout_fn,
        "act": _activation_fn,
    }
    name2params = {
        "layer": layer_fn,
        "norm": norm_fn,
        "drop": dropout_fn,
        "act": activation_fn,
    }

    net = []
    for i, (f_in, f_out) in enumerate(utils.pairwise(hiddens)):
        block = []
        for key in layer_order:
            sub_fn = name2fn[key]
            sub_params = deepcopy(name2params[key][i])
            if isinstance(sub_params, Dict):
                sub_module = sub_params.pop("module")
            else:
                sub_module = sub_params
                sub_params = {}
            sub_block = sub_fn(sub_module, f_in, f_out, **sub_params)
            if sub_block is not None:
                block.append((f"{key}", sub_block))

        block_ = OrderedDict(block)
        block = torch.nn.Sequential(block_)

        if block_.get("act", None) is not None:
            activation = block_["act"]
            activation_init = \
                utils.create_optimal_inner_init(nonlinearity=activation)
            block.apply(activation_init)

        # per-layer residual: "hard" always wraps, "soft" only when widths match
        if residual[i] == "hard" \
                or (residual[i] == "soft" and f_in == f_out):
            block = ResidualWrapper(net=block)
        net.append((f"block_{i}", block))

    self.net = torch.nn.Sequential(OrderedDict(net))
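# Usage sketch (assumes the module's forward simply applies self.net; module
# specs can be classes, registered strings resolved via MODULES.get_if_str,
# or dicts with a "module" key plus kwargs):
# net = SequentialNet(
#     hiddens=[16, 64, 64, 4],
#     layer_fn=nn.Linear,
#     norm_fn=nn.LayerNorm,
#     dropout_fn={"module": nn.Dropout, "p": 0.1},
#     activation_fn=nn.ReLU,
#     residual="soft",   # only wraps blocks with equal in/out widths
# )
# out = net(torch.randn(2, 16))   # -> [2, 4]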
def __init__(self,
             encoder,
             num_classes,
             feature_net_hiddens=None,
             emb_net_hiddens=None,
             activation_fn=torch.nn.ReLU,
             norm_fn=None,
             bias=True,
             dropout=None,
             consensus=None,
             kernel_size=1,
             feature_net_skip_connection=False,
             early_consensus=True):
    super().__init__()
    assert consensus is not None
    assert kernel_size in [1, 3, 5]

    consensus = consensus if isinstance(consensus, list) else [consensus]
    self.consensus = consensus
    self.encoder = encoder
    # p=0 disables dropout when `dropout` is None
    self.dropout = nn.Dropout(dropout or 0.0)
    self.feature_net_skip_connection = feature_net_skip_connection
    self.early_consensus = early_consensus

    nonlinearity = registry.MODULES.get_if_str(activation_fn)
    inner_init = create_optimal_inner_init(nonlinearity=nonlinearity)

    kernel2pad = {1: 0, 3: 1, 5: 2}

    def layer_fn(in_features, out_features, bias=True):
        return nn.Conv1d(
            in_features, out_features,
            bias=bias,
            kernel_size=kernel_size,
            padding=kernel2pad[kernel_size])

    if feature_net_hiddens is not None:
        self.feature_net = SequentialNet(
            hiddens=[encoder.out_features] + [feature_net_hiddens],
            layer_fn=layer_fn,
            norm_fn=norm_fn,
            activation_fn=activation_fn,
        )
        self.feature_net.apply(inner_init)
        out_features = feature_net_hiddens
    else:
        # if there is no feature net, there is nothing to skip over
        assert not self.feature_net_skip_connection
        self.feature_net = lambda x: x
        out_features = encoder.out_features

    # Differences start here.
    # Input channels to the consensus function
    # (also to the embedding net, multiplied by len(consensus))
    if self.feature_net_skip_connection:
        in_channels = out_features + encoder.out_features
    else:
        in_channels = out_features

    consensus_fn = OrderedDict()
    for key in sorted(consensus):
        if key == "attention":
            self.attn = nn.Sequential(
                nn.Conv1d(
                    in_channels=in_channels,
                    out_channels=1,
                    kernel_size=kernel_size,
                    padding=kernel2pad[kernel_size],
                    bias=True),
                nn.Softmax(dim=1))

            def self_attn_fn(x):
                x_a = x.transpose(1, 2)
                x_attn = (self.attn(x_a) * x_a)
                x_attn = x_attn.transpose(1, 2)
                x_attn = x_attn.mean(1, keepdim=True)
                return x_attn

            consensus_fn["attention"] = self_attn_fn
        elif key == "avg":
            consensus_fn[key] = lambda x: x.mean(1, keepdim=True)
        elif key == "max":
            consensus_fn[key] = lambda x: x.max(1, keepdim=True)[0]

    # Not the most optimized, but the more understandable logic
    if self.early_consensus:
        out_features = emb_net_hiddens
        self.emb_net = SequentialNet(
            hiddens=[in_channels * len(consensus_fn), emb_net_hiddens],
            layer_fn=nn.Linear,
            norm_fn=norm_fn,
            activation_fn=activation_fn,
        )
        self.emb_net.apply(inner_init)
    else:
        if self.feature_net_skip_connection:
            out_features = out_features + self.encoder.out_features
        else:
            out_features = out_features

    self.head = nn.Linear(out_features, num_classes, bias=True)

    if "attention" in consensus:
        self.attn.apply(outer_init)
    self.head.apply(outer_init)

    self.consensus_fn = consensus_fn
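# Rough usage sketch (assumes `encoder` is any module exposing `out_features`,
# and that the enclosing class applies encoder -> feature_net -> consensus_fn
# -> emb_net/head over a [batch, num_segments, features] input, as in temporal
# segment-style models; `SomeBackbone` and `SegmentClassifier` are hypothetical
# names used only for illustration):
# encoder = SomeBackbone()            # hypothetical, with encoder.out_features == 512
# net = SegmentClassifier(            # hypothetical name of the enclosing class
#     encoder=encoder,
#     num_classes=10,
#     feature_net_hiddens=256,
#     emb_net_hiddens=128,
#     dropout=0.5,
#     consensus=["avg", "attention"],
#     kernel_size=1,
# )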