def _construct_network(self, cfg): """ Builds a single pathway ResNet model. Args: cfg (CfgNode): model building configs, details are in the comments of the config file. """ assert cfg.MODEL.ARCH in _POOL1.keys() pool_size = _POOL1[cfg.MODEL.ARCH] assert len({len(pool_size), self.num_pathways}) == 1 assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] num_groups = cfg.RESNET.NUM_GROUPS width_per_group = cfg.RESNET.WIDTH_PER_GROUP dim_inner = num_groups * width_per_group temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] self.s1 = stem_helper.VideoModelStem( dim_in=cfg.DATA.INPUT_CHANNEL_NUM, dim_out=[width_per_group], kernel=[temp_kernel[0][0] + [7, 7]], stride=[[1, 2, 2]], padding=[[temp_kernel[0][0][0] // 2, 3, 3]], ) self.s2 = resnet_helper.ResStage( dim_in=[width_per_group], dim_out=[width_per_group * 4], dim_inner=[dim_inner], temp_kernel_sizes=temp_kernel[1], stride=[1], num_blocks=[d2], num_groups=[num_groups], num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[0], nonlocal_inds=cfg.NONLOCAL.LOCATION[0], nonlocal_group=cfg.NONLOCAL.GROUP[0], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, stride_1x1=cfg.RESNET.STRIDE_1X1, inplace_relu=cfg.RESNET.INPLACE_RELU, ) for pathway in range(self.num_pathways): pool = nn.MaxPool3d( kernel_size=pool_size[pathway], stride=pool_size[pathway], padding=[0, 0, 0], ) self.add_module("pathway{}_pool".format(pathway), pool) self.s3 = resnet_helper.ResStage( dim_in=[width_per_group * 4], dim_out=[width_per_group * 8], dim_inner=[dim_inner * 2], temp_kernel_sizes=temp_kernel[2], stride=[2], num_blocks=[d3], num_groups=[num_groups], num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[1], nonlocal_inds=cfg.NONLOCAL.LOCATION[1], nonlocal_group=cfg.NONLOCAL.GROUP[1], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, stride_1x1=cfg.RESNET.STRIDE_1X1, inplace_relu=cfg.RESNET.INPLACE_RELU, ) self.s4 = resnet_helper.ResStage( dim_in=[width_per_group * 8], dim_out=[width_per_group * 16], dim_inner=[dim_inner * 4], temp_kernel_sizes=temp_kernel[3], stride=[2], num_blocks=[d4], num_groups=[num_groups], num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[2], nonlocal_inds=cfg.NONLOCAL.LOCATION[2], nonlocal_group=cfg.NONLOCAL.GROUP[2], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, stride_1x1=cfg.RESNET.STRIDE_1X1, inplace_relu=cfg.RESNET.INPLACE_RELU, ) self.s5 = resnet_helper.ResStage( dim_in=[width_per_group * 16], dim_out=[width_per_group * 32], dim_inner=[dim_inner * 8], temp_kernel_sizes=temp_kernel[4], stride=[2], num_blocks=[d5], num_groups=[num_groups], num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[3], nonlocal_inds=cfg.NONLOCAL.LOCATION[3], nonlocal_group=cfg.NONLOCAL.GROUP[3], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, stride_1x1=cfg.RESNET.STRIDE_1X1, inplace_relu=cfg.RESNET.INPLACE_RELU, ) self.head = head_helper.ResNetBasicHead( dim_in=[width_per_group * 32], num_classes=cfg.MODEL.NUM_CLASSES, pool_size=[[ cfg.DATA.NUM_FRAMES // pool_size[0][0], cfg.DATA.CROP_SIZE // 32 // pool_size[0][1], cfg.DATA.CROP_SIZE // 32 // pool_size[0][2], ]], dropout_rate=cfg.MODEL.DROPOUT_RATE, )
def _construct_network(self, cfg): """ Builds a SlowFast model. The first pathway is the Slow pathway and the second pathway is the Fast pathway. Args: cfg (CfgNode): model building configs, details are in the comments of the config file. """ assert cfg.MODEL.ARCH in _POOL1.keys() pool_size = _POOL1[cfg.MODEL.ARCH] assert len({len(pool_size), self.num_pathways}) == 1 assert cfg.RESNET.DEPTH in _MODEL_STAGE_DEPTH.keys() (d2, d3, d4, d5) = _MODEL_STAGE_DEPTH[cfg.RESNET.DEPTH] num_groups = cfg.RESNET.NUM_GROUPS width_per_group = cfg.RESNET.WIDTH_PER_GROUP dim_inner = num_groups * width_per_group out_dim_ratio = (cfg.SLOWFAST.BETA_INV // cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO) temp_kernel = _TEMPORAL_KERNEL_BASIS[cfg.MODEL.ARCH] self.s1 = stem_helper.VideoModelStem( dim_in=cfg.DATA.INPUT_CHANNEL_NUM, dim_out=[ width_per_group, width_per_group // cfg.SLOWFAST.BETA_INV ], kernel=[temp_kernel[0][0] + [7, 7], temp_kernel[0][1] + [7, 7]], stride=[[1, 2, 2]] * 2, padding=[ [temp_kernel[0][0][0] // 2, 3, 3], [temp_kernel[0][1][0] // 2, 3, 3], ], ) self.s1_fuse = FuseFastToSlow( width_per_group // cfg.SLOWFAST.BETA_INV, cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO, cfg.SLOWFAST.FUSION_KERNEL_SZ, cfg.SLOWFAST.ALPHA, ) self.s2 = resnet_helper.ResStage( dim_in=[ width_per_group + width_per_group // out_dim_ratio, width_per_group // cfg.SLOWFAST.BETA_INV, ], dim_out=[ width_per_group * 4, width_per_group * 4 // cfg.SLOWFAST.BETA_INV, ], dim_inner=[dim_inner, dim_inner // cfg.SLOWFAST.BETA_INV], temp_kernel_sizes=temp_kernel[1], stride=[1] * 2, num_blocks=[d2] * 2, num_groups=[num_groups] * 2, num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[0], nonlocal_inds=cfg.NONLOCAL.LOCATION[0], nonlocal_group=cfg.NONLOCAL.GROUP[0], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, ) self.s2_fuse = FuseFastToSlow( width_per_group * 4 // cfg.SLOWFAST.BETA_INV, cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO, cfg.SLOWFAST.FUSION_KERNEL_SZ, cfg.SLOWFAST.ALPHA, ) for pathway in range(self.num_pathways): pool = nn.MaxPool3d( kernel_size=pool_size[pathway], stride=pool_size[pathway], padding=[0, 0, 0], ) self.add_module("pathway{}_pool".format(pathway), pool) self.s3 = resnet_helper.ResStage( dim_in=[ width_per_group * 4 + width_per_group * 4 // out_dim_ratio, width_per_group * 4 // cfg.SLOWFAST.BETA_INV, ], dim_out=[ width_per_group * 8, width_per_group * 8 // cfg.SLOWFAST.BETA_INV, ], dim_inner=[dim_inner * 2, dim_inner * 2 // cfg.SLOWFAST.BETA_INV], temp_kernel_sizes=temp_kernel[2], stride=[2] * 2, num_blocks=[d3] * 2, num_groups=[num_groups] * 2, num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[1], nonlocal_inds=cfg.NONLOCAL.LOCATION[1], nonlocal_group=cfg.NONLOCAL.GROUP[1], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, ) self.s3_fuse = FuseFastToSlow( width_per_group * 8 // cfg.SLOWFAST.BETA_INV, cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO, cfg.SLOWFAST.FUSION_KERNEL_SZ, cfg.SLOWFAST.ALPHA, ) self.s4 = resnet_helper.ResStage( dim_in=[ width_per_group * 8 + width_per_group * 8 // out_dim_ratio, width_per_group * 8 // cfg.SLOWFAST.BETA_INV, ], dim_out=[ width_per_group * 16, width_per_group * 16 // cfg.SLOWFAST.BETA_INV, ], dim_inner=[dim_inner * 4, dim_inner * 4 // cfg.SLOWFAST.BETA_INV], temp_kernel_sizes=temp_kernel[3], stride=[2] * 2, num_blocks=[d4] * 2, num_groups=[num_groups] * 2, num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[2], nonlocal_inds=cfg.NONLOCAL.LOCATION[2], nonlocal_group=cfg.NONLOCAL.GROUP[2], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, ) self.s4_fuse = FuseFastToSlow( width_per_group * 16 // cfg.SLOWFAST.BETA_INV, cfg.SLOWFAST.FUSION_CONV_CHANNEL_RATIO, cfg.SLOWFAST.FUSION_KERNEL_SZ, cfg.SLOWFAST.ALPHA, ) self.s5 = resnet_helper.ResStage( dim_in=[ width_per_group * 16 + width_per_group * 16 // out_dim_ratio, width_per_group * 16 // cfg.SLOWFAST.BETA_INV, ], dim_out=[ width_per_group * 32, width_per_group * 32 // cfg.SLOWFAST.BETA_INV, ], dim_inner=[dim_inner * 8, dim_inner * 8 // cfg.SLOWFAST.BETA_INV], temp_kernel_sizes=temp_kernel[4], stride=[2] * 2, num_blocks=[d5] * 2, num_groups=[num_groups] * 2, num_block_temp_kernel=cfg.RESNET.NUM_BLOCK_TEMP_KERNEL[3], nonlocal_inds=cfg.NONLOCAL.LOCATION[3], nonlocal_group=cfg.NONLOCAL.GROUP[3], instantiation=cfg.NONLOCAL.INSTANTIATION, trans_func_name=cfg.RESNET.TRANS_FUNC, ) self.head = head_helper.ResNetBasicHead( dim_in=[ width_per_group * 32, width_per_group * 32 // cfg.SLOWFAST.BETA_INV, ], num_classes=cfg.MODEL.NUM_CLASSES, pool_size=[ [ cfg.DATA.NUM_FRAMES // cfg.SLOWFAST.ALPHA // pool_size[0][0], cfg.DATA.CROP_SIZE // 32 // pool_size[0][1], cfg.DATA.CROP_SIZE // 32 // pool_size[0][2], ], [ cfg.DATA.NUM_FRAMES // pool_size[1][0], cfg.DATA.CROP_SIZE // 32 // pool_size[1][1], cfg.DATA.CROP_SIZE // 32 // pool_size[1][2], ], ], dropout_rate=cfg.MODEL.DROPOUT_RATE, )