def __init__(self, in_channels, out_channels, *, norm_layer, stride=1):
    super().__init__()

    # See note in ResidualBlock for the reason behind bias=True
    self.convnormrelu1 = Conv2dNormActivation(
        in_channels, out_channels // 4, norm_layer=norm_layer, kernel_size=1, bias=True
    )
    self.convnormrelu2 = Conv2dNormActivation(
        out_channels // 4, out_channels // 4, norm_layer=norm_layer, kernel_size=3, stride=stride, bias=True
    )
    self.convnormrelu3 = Conv2dNormActivation(
        out_channels // 4, out_channels, norm_layer=norm_layer, kernel_size=1, bias=True
    )
    self.relu = nn.ReLU(inplace=True)

    if stride == 1:
        self.downsample = nn.Identity()
    else:
        self.downsample = Conv2dNormActivation(
            in_channels,
            out_channels,
            norm_layer=norm_layer,
            kernel_size=1,
            stride=stride,
            bias=True,
            activation_layer=None,
        )
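# For context, a minimal sketch of the forward pass this wiring implies, assuming the
# standard residual formulation (bottleneck branch plus downsampled shortcut, then ReLU).
# This is an illustration, not copied from the source.
def forward(self, x):
    # Bottleneck branch: 1x1 reduce -> 3x3 (possibly strided) -> 1x1 expand
    y = self.convnormrelu1(x)
    y = self.convnormrelu2(y)
    y = self.convnormrelu3(y)
    # Shortcut: identity when stride == 1, otherwise a strided 1x1 projection
    return self.relu(self.downsample(x) + y)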
def __init__(self, in_channels, out_channels, *, norm_layer, stride=1):
    super().__init__()

    # Note regarding bias=True:
    # Usually we can pass bias=False in conv layers followed by a norm layer.
    # But in the RAFT training reference, the BatchNorm2d layers are only activated for the first dataset,
    # and frozen for the rest of the training process (i.e. set as eval()). The bias term is thus still useful
    # for the rest of the datasets. Technically, we could remove the bias for other norm layers like Instance norm
    # because these aren't frozen, but we don't bother (also, we wouldn't be able to load the original weights).
    self.convnormrelu1 = Conv2dNormActivation(
        in_channels, out_channels, norm_layer=norm_layer, kernel_size=3, stride=stride, bias=True
    )
    self.convnormrelu2 = Conv2dNormActivation(
        out_channels, out_channels, norm_layer=norm_layer, kernel_size=3, bias=True
    )

    if stride == 1:
        self.downsample = nn.Identity()
    else:
        self.downsample = Conv2dNormActivation(
            in_channels,
            out_channels,
            norm_layer=norm_layer,
            kernel_size=1,
            stride=stride,
            bias=True,
            activation_layer=None,
        )

    self.relu = nn.ReLU(inplace=True)
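# Likewise, a minimal sketch of the matching forward pass, assuming the usual
# two-conv residual layout with a projection shortcut; an illustration only.
def forward(self, x):
    # Main branch: two 3x3 conv-norm-relu layers
    y = self.convnormrelu1(x)
    y = self.convnormrelu2(y)
    # Shortcut: identity when stride == 1, otherwise a strided 1x1 projection
    return self.relu(self.downsample(x) + y)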
def __init__(self, in_channels: int, num_anchors: int, conv_depth=1) -> None:
    super().__init__()
    convs = []
    for _ in range(conv_depth):
        convs.append(Conv2dNormActivation(in_channels, in_channels, kernel_size=3, norm_layer=None))
    self.conv = nn.Sequential(*convs)
    self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)
    self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1, stride=1)

    for layer in self.modules():
        if isinstance(layer, nn.Conv2d):
            torch.nn.init.normal_(layer.weight, std=0.01)  # type: ignore[arg-type]
            if layer.bias is not None:
                torch.nn.init.constant_(layer.bias, 0)  # type: ignore[arg-type]
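# A sketch of how a head like this is typically applied, one feature map per pyramid
# level; the forward signature below is an assumption, not copied from the source.
def forward(self, features):
    # `features` is assumed to be a list of feature maps, e.g. one per FPN level
    logits = []
    bbox_reg = []
    for feature in features:
        # Shared 3x3 tower, then 1x1 heads: objectness per anchor and 4 box deltas per anchor
        t = self.conv(feature)
        logits.append(self.cls_logits(t))
        bbox_reg.append(self.bbox_pred(t))
    return logits, bbox_reg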
def __init__(self, *, block=ResidualBlock, layers=(64, 64, 96, 128, 256), norm_layer=nn.BatchNorm2d):
    super().__init__()

    if len(layers) != 5:
        raise ValueError(f"The expected number of layers is 5, instead got {len(layers)}")

    # See note in ResidualBlock for the reason behind bias=True
    self.convnormrelu = Conv2dNormActivation(
        3, layers[0], norm_layer=norm_layer, kernel_size=7, stride=2, bias=True
    )

    self.layer1 = self._make_2_blocks(block, layers[0], layers[1], norm_layer=norm_layer, first_stride=1)
    self.layer2 = self._make_2_blocks(block, layers[1], layers[2], norm_layer=norm_layer, first_stride=2)
    self.layer3 = self._make_2_blocks(block, layers[2], layers[3], norm_layer=norm_layer, first_stride=2)

    self.conv = nn.Conv2d(layers[3], layers[4], kernel_size=1)

    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
        elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d)):
            if m.weight is not None:
                nn.init.constant_(m.weight, 1)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
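# _make_2_blocks is referenced above but not shown; a plausible sketch, assuming each
# stage stacks two blocks where only the first one may be strided.
def _make_2_blocks(self, block, in_channels, out_channels, *, norm_layer, first_stride):
    # The first block may downsample (first_stride); the second keeps the resolution
    block1 = block(in_channels, out_channels, norm_layer=norm_layer, stride=first_stride)
    block2 = block(out_channels, out_channels, norm_layer=norm_layer, stride=1)
    return nn.Sequential(block1, block2)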
def __init__(self, *, in_channels_corr, corr_layers=(256, 192), flow_layers=(128, 64), out_channels=128):
    super().__init__()

    if len(flow_layers) != 2:
        raise ValueError(f"The expected number of flow_layers is 2, instead got {len(flow_layers)}")
    if len(corr_layers) not in (1, 2):
        raise ValueError(f"The number of corr_layers should be 1 or 2, instead got {len(corr_layers)}")

    self.convcorr1 = Conv2dNormActivation(in_channels_corr, corr_layers[0], norm_layer=None, kernel_size=1)
    if len(corr_layers) == 2:
        self.convcorr2 = Conv2dNormActivation(corr_layers[0], corr_layers[1], norm_layer=None, kernel_size=3)
    else:
        self.convcorr2 = nn.Identity()

    self.convflow1 = Conv2dNormActivation(2, flow_layers[0], norm_layer=None, kernel_size=7)
    self.convflow2 = Conv2dNormActivation(flow_layers[0], flow_layers[1], norm_layer=None, kernel_size=3)

    # out_channels - 2 because we cat the flow (2 channels) at the end
    self.conv = Conv2dNormActivation(
        corr_layers[-1] + flow_layers[-1], out_channels - 2, norm_layer=None, kernel_size=3
    )

    self.out_channels = out_channels
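# A sketch of the forward pass this wiring implies (an assumption, not copied from the
# source): the correlation and flow branches are processed separately, concatenated,
# mixed by self.conv, and the raw 2-channel flow is re-appended so the output has
# exactly out_channels channels. Assumes `torch` is imported.
def forward(self, flow, corr_features):
    corr = self.convcorr1(corr_features)
    corr = self.convcorr2(corr)

    flow_orig = flow
    flow = self.convflow1(flow)
    flow = self.convflow2(flow)

    # corr_layers[-1] + flow_layers[-1] channels in, out_channels - 2 channels out
    corr_flow = torch.cat([corr, flow], dim=1)
    corr_flow = self.conv(corr_flow)

    # Re-append the raw flow (2 channels) so the total is out_channels
    return torch.cat([corr_flow, flow_orig], dim=1)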
def __init__(self, *, in_channels, hidden_size, multiplier=0.25):
    super().__init__()
    self.convrelu = Conv2dNormActivation(in_channels, hidden_size, norm_layer=None, kernel_size=3)
    # 8 * 8 * 9 because the predicted flow is downsampled by 8, from the downsampling of the initial
    # FeatureEncoder, and we interpolate with all 9 surrounding neighbors. See paper and appendix B.
    self.conv = nn.Conv2d(hidden_size, 8 * 8 * 9, 1, padding=0)

    # In the original code, they use a factor of 0.25 to "downweight the gradients" of that branch.
    # See e.g. https://github.com/princeton-vl/RAFT/issues/119#issuecomment-953950419
    # or https://github.com/princeton-vl/RAFT/issues/24.
    # It doesn't seem to affect epe significantly and can likely be set to 1.
    self.multiplier = multiplier
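# A minimal sketch of the corresponding forward pass (an illustration): the module only
# produces the scaled mask logits; the softmax over the 9 neighbors and the convex
# upsampling itself happen downstream.
def forward(self, x):
    x = self.convrelu(x)
    x = self.conv(x)
    # Scale the mask logits to downweight the gradients of this branch
    return self.multiplier * x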
def __init__(self, *, in_channels_corr, corr_layers=(256, 192), flow_layers=(128, 64), out_channels=128):
    super().__init__()

    assert len(flow_layers) == 2
    assert len(corr_layers) in (1, 2)

    self.convcorr1 = Conv2dNormActivation(in_channels_corr, corr_layers[0], norm_layer=None, kernel_size=1)
    if len(corr_layers) == 2:
        self.convcorr2 = Conv2dNormActivation(corr_layers[0], corr_layers[1], norm_layer=None, kernel_size=3)
    else:
        self.convcorr2 = nn.Identity()

    self.convflow1 = Conv2dNormActivation(2, flow_layers[0], norm_layer=None, kernel_size=7)
    self.convflow2 = Conv2dNormActivation(flow_layers[0], flow_layers[1], norm_layer=None, kernel_size=3)

    # out_channels - 2 because we cat the flow (2 channels) at the end
    self.conv = Conv2dNormActivation(
        corr_layers[-1] + flow_layers[-1], out_channels - 2, norm_layer=None, kernel_size=3
    )
    self.out_channels = out_channels
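# For illustration, a hypothetical instantiation with a single correlation layer. The
# class name MotionEncoder and the in_channels_corr value are assumptions; with
# corr_layers of length 1, convcorr2 is an nn.Identity(), so self.conv sees
# corr_layers[0] + flow_layers[-1] = 256 + 64 input channels and emits
# out_channels - 2 = 126 channels before the raw flow is concatenated back.
motion_encoder = MotionEncoder(
    in_channels_corr=196, corr_layers=(256,), flow_layers=(128, 64), out_channels=128
)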
def __init__(self, *, in_channels: int, hidden_size: int, out_channels: int, multiplier: float = 0.25):
    # Deliberately skip raft.MaskPredictor.__init__ and call its parent's (nn.Module)
    # __init__ instead, since all the layers are redefined below with a configurable
    # number of output channels.
    super(raft.MaskPredictor, self).__init__()
    self.convrelu = Conv2dNormActivation(in_channels, hidden_size, norm_layer=None, kernel_size=3)
    self.conv = nn.Conv2d(hidden_size, out_channels, kernel_size=1, padding=0)
    self.multiplier = multiplier
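# Given the super() call, this is presumably a subclass of raft.MaskPredictor that keeps
# the parent's forward (scale the mask logits by multiplier) while making the number of
# output channels configurable instead of hard-coding 8 * 8 * 9. A hypothetical
# instantiation, with the class name and all sizes assumed for illustration: a 4x
# convex-upsampling mask over 9 neighbors would need 4 * 4 * 9 output channels.
mask_predictor = MaskPredictor(in_channels=128, hidden_size=256, out_channels=4 * 4 * 9)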