def __init__(self, bottleneck2d, time_dim=1, time_padding=0, frame=0, num_segments=4):
    """Build a 3D basic block from the layers of an existing 2D residual block.

    Args:
        bottleneck2d: Source 2D block. It must expose ``conv1``, ``bn1``,
            ``conv2``, ``bn2``, ``downsample`` and ``stride``.
        time_dim: Forwarded as ``time_dim`` to ``inflate.inflate_conv`` for
            both convolutions.
        time_padding: Forwarded as ``time_padding`` to
            ``inflate.inflate_conv`` for both convolutions.
        frame: Stored unchanged as ``self.frame``.
        num_segments: Stored unchanged as ``self.num_segments``.
    """
    super(BasicBlock3d, self).__init__()

    # Stride of the 2D block's second conv; handed to the shortcut builder
    # below. NOTE: inflate_downsample, as defined in this file, accepts this
    # value but does not forward it to the shortcut convolution.
    spatial_stride = bottleneck2d.conv2.stride[0]

    self.frame = frame
    self.num_segments = num_segments

    # Both convolutions share the same temporal settings; conv2 additionally
    # pins time_stride to 1.
    temporal_kwargs = dict(time_dim=time_dim, time_padding=time_padding, center=False)

    # Submodules are assigned in a fixed order (conv1, bn1, conv2, bn2, relu,
    # downsample) so module registration order stays stable.
    self.conv1 = inflate.inflate_conv(bottleneck2d.conv1, **temporal_kwargs)
    self.bn1 = inflate.inflate_batch_norm(bottleneck2d.bn1)
    self.conv2 = inflate.inflate_conv(bottleneck2d.conv2, time_stride=1, **temporal_kwargs)
    self.bn2 = inflate.inflate_batch_norm(bottleneck2d.bn2)
    self.relu = torch.nn.ReLU(inplace=True)

    # Blocks without a 2D shortcut projection keep ``downsample`` as None.
    shortcut2d = bottleneck2d.downsample
    if shortcut2d is None:
        self.downsample = None
    else:
        self.downsample = inflate_downsample(shortcut2d, time_stride=spatial_stride)

    self.stride = bottleneck2d.stride
def inflate_downsample(downsample2d, time_stride=1):
    """Build the 3D counterpart of a 2D shortcut (conv followed by batch norm).

    Args:
        downsample2d: Indexable container whose element 0 is passed to
            ``inflate.inflate_conv`` and whose element 1 is passed to
            ``inflate.inflate_batch_norm``.
        time_stride: Accepted for interface compatibility but not used; the
            convolution is always built with ``time_stride=1``.

    Returns:
        A ``torch.nn.Sequential`` holding the converted conv and batch norm,
        in that order.
    """
    # The shortcut conv always uses a temporal kernel of 1 and a temporal
    # stride of 1, regardless of the ``time_stride`` argument.
    shortcut_conv = inflate.inflate_conv(
        downsample2d[0], time_dim=1, time_stride=1, center=False
    )
    shortcut_norm = inflate.inflate_batch_norm(downsample2d[1])
    return torch.nn.Sequential(shortcut_conv, shortcut_norm)
def __init__(self, resnet2d, frame_nb=16, class_nb=1000, conv_class=False,
             num_segments=4, test_mode=False, fast_implementation=0):
    """Build the 3D network from the layers of a 2D ResNet.

    Args:
        resnet2d: Source 2D network. It must expose ``conv1``, ``bn1``,
            ``maxpool``, ``layer1`` .. ``layer4`` and ``avgpool``.
        frame_nb: Stored as ``self.frame``. When ``conv_class`` is false it
            is also the temporal extent of the final average-pooling window.
        class_nb: Number of output channels of the convolutional classifier
            (only used when ``conv_class`` is true).
        conv_class: Whether to use convolutional layer as classifier to
            adapt to various number of frames.
        num_segments: Stored as ``self.num_segments`` and forwarded to
            ``inflate_reslayer`` for ``layer2``.
        test_mode: When ``conv_class`` is false, selects an 8x8 spatial
            pooling window instead of 7x7.
        fast_implementation: Forwarded to ``inflate_reslayer`` for
            ``layer2``.
    """
    super(I3ResNet, self).__init__()
    self.conv_class = conv_class
    self.num_segments = num_segments
    self.frame = frame_nb

    # Stem: built with a temporal kernel of 1, no temporal padding, and a
    # temporal pooling stride of 1.
    self.conv1 = inflate.inflate_conv(resnet2d.conv1, time_dim=1, time_padding=0, center=False)
    self.bn1 = inflate.inflate_batch_norm(resnet2d.bn1)
    self.relu = torch.nn.ReLU(inplace=True)
    self.maxpool = inflate.inflate_pool(resnet2d.maxpool, time_dim=1, time_padding=0, time_stride=1)

    # Residual stages. Only layer2 receives the R4D-related arguments;
    # layer3 and layer4 use a temporal kernel of 3 with padding 1.
    self.layer1 = inflate_reslayer(resnet2d.layer1)
    self.layer2 = inflate_reslayer(
        resnet2d.layer2,
        num_R4D=2,
        in_channels=512,
        fast_implementation=fast_implementation,
        num_segments=num_segments,
    )
    self.layer3 = inflate_reslayer(resnet2d.layer3, time_dim=3, time_padding=1)
    self.layer4 = inflate_reslayer(resnet2d.layer4, time_dim=3, time_padding=1)

    if conv_class:
        self.avgpool = inflate.inflate_pool(resnet2d.avgpool, time_dim=1)
        self.classifier = torch.nn.Conv3d(
            in_channels=2048,
            out_channels=class_nb,
            kernel_size=(1, 1, 1),
            bias=True,
        )
    else:
        # No classifier is created on this path; only the pooling layer.
        # The previously computed ``final_time_dim`` was never read, so the
        # dead assignment has been dropped.
        spatial_window = 8 if test_mode else 7
        self.avgpool = nn.AvgPool3d((frame_nb, spatial_window, spatial_window))
def __init__(self, resnet2d, frame_nb=16, class_nb=1000, conv_class=False,
             num_segments=1, gtsn=False):
    """Build the 3D network from the layers of a 2D ResNet-18/34.

    Args:
        resnet2d: Source 2D network. It must expose ``conv1``, ``bn1``,
            ``maxpool``, ``layer1`` .. ``layer4`` and ``avgpool``.
        frame_nb: Kept for interface compatibility; this constructor does
            not use it (the non-convolutional pooling path uses a fixed
            ``time_dim=4``).
        class_nb: Number of output channels of the convolutional classifier
            (only used when ``conv_class`` is true).
        conv_class: Whether to use convolutional layer as classifier to
            adapt to various number of frames.
        num_segments: Stored unchanged as ``self.num_segments``.
        gtsn: Stored unchanged as ``self.gtsn``.
    """
    super(I3ResNet_18_34, self).__init__()
    self.num_segments = num_segments
    self.conv_class = conv_class
    self.gtsn = gtsn

    # Stem: built with a temporal kernel of 1, no temporal padding, and a
    # temporal pooling stride of 1.
    self.conv1 = inflate.inflate_conv(resnet2d.conv1, time_dim=1, time_padding=0, center=False)
    self.bn1 = inflate.inflate_batch_norm(resnet2d.bn1)
    self.relu = torch.nn.ReLU(inplace=True)
    self.maxpool = inflate.inflate_pool(resnet2d.maxpool, time_dim=1, time_padding=0, time_stride=1)

    # Residual stages. layer2 and layer3 receive the R4D-related arguments;
    # layer3 and layer4 use a temporal kernel of 3 with padding 1.
    self.layer1 = inflate_reslayer_18_34(resnet2d.layer1)
    self.layer2 = inflate_reslayer_18_34(resnet2d.layer2, num_R4D=3, in_channels=128)
    self.layer3 = inflate_reslayer_18_34(
        resnet2d.layer3, time_dim=3, time_padding=1, num_R4D=3, in_channels=256
    )
    self.layer4 = inflate_reslayer_18_34(resnet2d.layer4, time_dim=3, time_padding=1)

    if conv_class:
        self.avgpool = inflate.inflate_pool(resnet2d.avgpool, time_dim=1)
        # ResNet-18/34 are built from basic blocks, so layer4 emits 512
        # channels (consistent with the 128/256 widths used for layer2 and
        # layer3 above). The former value of 2048 is the Bottleneck
        # (ResNet-50+) width and would not match the incoming features.
        # Assumes resnet2d is a standard-width ResNet-18/34.
        self.classifier = torch.nn.Conv3d(
            in_channels=512,
            out_channels=class_nb,
            kernel_size=(1, 1, 1),
            bias=True,
        )
    else:
        # No classifier is created on this path; only the pooling layer.
        # The previously computed ``final_time_dim`` was never read, so the
        # dead assignment has been dropped.
        self.avgpool = inflate.inflate_pool(resnet2d.avgpool, time_dim=4)