Ejemplo n.º 1
0
    def __init__(self, num_classes, dropout=None, **kwargs):
        """Register one branch per modality, a fusion branch and the classifiers.

        For every entry of ``modalities`` a ``Modality_Block``, a
        BatchNorm3d + ReLU tail (``<name>_tail``) and an average pool
        (``<name>_gap``) are registered, followed by the ``Fusion`` block with
        its own ``Fusion_tail`` / ``Fusion_gap``.

        Args:
            num_classes: forwarded unchanged to ``MultitaskClassifiers``.
            dropout: dropout probability; ``self.dropout`` is only created
                when this value is truthy.
            **kwargs: optional settings read here (defaults in parentheses):
                ``modalities`` (``{'RGB': 3}``), ``num_coords`` (0),
                ``num_objects`` (0), ``k_sec`` (``{2: 3, 3: 4, 4: 6, 5: 3}``)
                and ``groups`` (16).
        """
        super(MFNET_3D_MO_MM, self).__init__()

        self.num_classes = num_classes
        self.dropout_val = dropout
        # Mapping of modality name -> number of input channels; an optical
        # flow stream would be described as {'Flow': 2}.
        self.modalities = kwargs.get('modalities', {'RGB': 3})
        self.num_coords = kwargs.get('num_coords', 0)
        self.num_objects = kwargs.get('num_objects', 0)
        self.num_modalities = len(self.modalities)
        self.fusion_nets = self.num_modalities - 1

        k_sec = kwargs.get('k_sec', {2: 3, 3: 4, 4: 6, 5: 3})
        groups = kwargs.get('groups', 16)
        num_out = [16, 96, 192, 384, 768]
        num_mid = [96, 192, 384, 768]
        feat_dim = num_out[-1]

        def make_tail():
            # BatchNorm + ReLU applied on the last feature width.
            layers = OrderedDict()
            layers['bn'] = nn.BatchNorm3d(feat_dim)
            layers['relu'] = nn.ReLU(inplace=True)
            return nn.Sequential(layers)

        def make_pool():
            # Fixed (8, 7, 7) averaging window; the stride comes from the
            # class-level STRD table.
            return nn.AvgPool3d(kernel_size=(8, 7, 7),
                                stride=MFNET_3D_MO_MM.STRD['stable'])

        # Per-modality branch: backbone, tail, pool (registered in that order).
        for name, in_channels in self.modalities.items():
            self.add_module(
                name,
                Modality_Block(name, in_channels, groups, k_sec, num_mid,
                               num_out))
            self.add_module("{}_tail".format(name), make_tail())
            self.add_module("{}_gap".format(name), make_pool())

        # Fusion branch, built from the modality names.
        self.add_module(
            'Fusion',
            Fusion_Block(self.modalities.keys(), groups, k_sec, num_mid,
                         num_out))
        self.add_module('Fusion_tail', make_tail())
        self.add_module('Fusion_gap', make_pool())

        if dropout:
            self.dropout = nn.Dropout(p=dropout)

        # Classifier input width: one feature vector per modality plus one
        # per fusion net (num_modalities - 1 of them).
        self.classifier_list = MultitaskClassifiers(
            feat_dim * (self.num_modalities + self.fusion_nets), num_classes)

        # Hand the assembled network to the project's `xavier` initialiser.
        xavier(net=self)
Ejemplo n.º 2
0
    def __init__(self, num_classes, dropout=None, **kwargs):
        """Build the slow and fast pathways, their fusions and the classifiers.

        Args:
            num_classes: forwarded unchanged to ``MultitaskClassifiers``.
            dropout: dropout probability; ``self.dropout`` is only created
                when this value is truthy.
            **kwargs: optional settings read here (defaults in parentheses):
                ``num_coords`` (0), ``num_objects`` (0),
                ``sf_a`` (6; multiplies ``num_frames`` to get the fast pooling
                length and is passed as ``alpha`` to ``FuseFastToSlow``),
                ``sf_b`` (6; divides the bottleneck width of the fast units),
                ``num_frames`` (4; temporal pooling length of the slow path),
                ``spatial_size`` (224), ``groups`` (16) and
                ``k_sec`` (``{2: 3, 3: 4, 4: 6, 5: 3}``; stage key -> number
                of units, iterated in dict order).
        """
        super(MFNET_3D_SF, self).__init__()

        self.num_classes = num_classes
        self.num_coords = kwargs.get('num_coords', 0)
        self.num_objects = kwargs.get('num_objects', 0)
        self.sf_a = kwargs.get('sf_a', 6)  # temporal ratio
        self.sf_b = kwargs.get('sf_b', 6)  # channel ratio
        slow_frames = kwargs.get('num_frames', 4)
        fast_frames = slow_frames * self.sf_a
        spatial_dim = kwargs.get('spatial_size', 224)
        groups = kwargs.get('groups', 16)
        k_sec = kwargs.get('k_sec', {2: 3, 3: 4, 4: 6, 5: 3})

        # Hard-coded channel widths per stage for both pathways.
        num_mid = [96, 192, 384, 768]
        slow_widths = [16, 96, 192, 384, 768]
        fast_widths = [4, 16, 32, 64, 128]
        fuse_ratio = 4

        def intro_conv(width, kernel, pad):
            # Conv3d on 3 input channels -> BatchNorm -> ReLU.
            return nn.Sequential(
                OrderedDict([
                    ('conv',
                     nn.Conv3d(3,
                               width,
                               kernel_size=kernel,
                               padding=pad,
                               stride=(1, 2, 2),
                               bias=False)),
                    ('bn', nn.BatchNorm3d(width)),
                    ('relu', nn.ReLU(inplace=True)),
                ]))

        def spatial_maxpool():
            return nn.MaxPool3d(kernel_size=(1, 3, 3),
                                stride=(1, 2, 2),
                                padding=(0, 1, 1))

        def lateral(fast_width):
            return FuseFastToSlow(fast_width,
                                  fusion_conv_channel_ratio=fuse_ratio,
                                  fusion_kernel=5,
                                  alpha=self.sf_a)

        # Intro layers: the slow path uses a temporal kernel of 1, the fast
        # path a temporal kernel of 5.
        self.slow_conv1 = intro_conv(slow_widths[0], (1, 5, 5), (0, 2, 2))
        self.slow_maxpool = spatial_maxpool()
        self.fast_conv1 = intro_conv(fast_widths[0], (5, 5, 5), (2, 2, 2))
        self.fast_maxpool = spatial_maxpool()
        self.s1_fuse = lateral(fast_widths[0])

        for stage_idx, (stage_key, depth) in enumerate(k_sec.items()):
            slow_t_kernel = 1 if stage_key in [2, 3] else 3
            # Only the first unit of every stage but the first one gets a
            # spatial stride of 2.
            first_stride = (1, 2, 2) if stage_idx != 0 else (1, 1, 1)
            fast_groups = 1 if stage_idx == 0 else groups

            slow_out = slow_widths[stage_idx + 1]
            fast_out = fast_widths[stage_idx + 1]
            # The first slow unit receives the slow features together with
            # the fused fast features, hence the wider input.
            slow_first_in = (slow_widths[stage_idx] +
                             fuse_ratio * fast_widths[stage_idx])

            slow_units = OrderedDict()
            for unit in range(1, depth + 1):
                leading = unit == 1
                slow_units["B%02d" % unit] = MF_UNIT_SF(
                    num_in=slow_first_in if leading else slow_out,
                    num_mid=num_mid[stage_idx],
                    num_out=slow_out,
                    stride=first_stride if leading else (1, 1, 1),
                    g=groups,
                    first_block=leading,
                    temporal_kernel=slow_t_kernel)
            block_slow = nn.Sequential(slow_units)

            fast_units = OrderedDict()
            for unit in range(1, depth + 1):
                leading = unit == 1
                fast_units["B%02d" % unit] = MF_UNIT_SF(
                    num_in=fast_widths[stage_idx] if leading else fast_out,
                    num_mid=num_mid[stage_idx] // self.sf_b,
                    num_out=fast_out,
                    stride=first_stride if leading else (1, 1, 1),
                    g=fast_groups,
                    first_block=leading,
                    temporal_kernel=3)
            block_fast = nn.Sequential(fast_units)

            fuse = lateral(fast_out)

            self.add_module("slow_conv{}".format(stage_key), block_slow)
            self.add_module("fast_conv{}".format(stage_key), block_fast)
            self.add_module("s{}_fuse".format(stage_key), fuse)

        # Tails: the slow tail normalises slow + fused-fast channels.
        slow_tail_width = slow_widths[-1] + fuse_ratio * fast_widths[-1]
        self.tail_slow = nn.Sequential(
            OrderedDict([('bn', nn.BatchNorm3d(slow_tail_width)),
                         ('relu', nn.ReLU(inplace=True))]))
        self.tail_fast = nn.Sequential(
            OrderedDict([('bn', nn.BatchNorm3d(fast_widths[-1])),
                         ('relu', nn.ReLU(inplace=True))]))

        # Pooling windows span the configured frame count and 1/32 of the
        # configured spatial size.
        pooled_side = spatial_dim // 32
        self.globalpool_slow = nn.AvgPool3d(
            kernel_size=(slow_frames, pooled_side, pooled_side),
            stride=(1, 1, 1))
        self.globalpool_fast = nn.AvgPool3d(
            kernel_size=(fast_frames, pooled_side, pooled_side),
            stride=(1, 1, 1))

        if dropout:
            self.dropout = nn.Dropout(p=dropout)

        # Classifier input width = slow tail width + fast tail width.
        self.classifier_list = MultitaskClassifiers(
            slow_widths[-1] + (fuse_ratio + 1) * fast_widths[-1], num_classes)

        # Hand the assembled network to the project's `xavier` initialiser.
        xavier(net=self)
Ejemplo n.º 3
0
    def __init__(self, num_classes, dropout=None, **kwargs):
        """Build the conv1-conv5 backbone and all task-specific output layers.

        Any number of classification outputs is supported; it is the caller's
        job to make sure they make sense (e.g. actions->actions and not
        actions->verbs, nouns, etc.).

        Args:
            num_classes: sequence of class counts; only the first 3 entries
                are kept when ``map_charades`` is truthy, otherwise the
                first 4.
            dropout: dropout probability; a ``dropout`` module is appended to
                ``self.globalpool`` only when this value is truthy.
            **kwargs: optional settings read here (defaults in parentheses):
                ``map_charades`` (None), ``num_coords`` (0; 2 is subtracted
                from it), ``num_objects`` (None), ``num_obj_cat`` (None),
                ``one_object_layer`` (False), ``norm`` ('BN'),
                ``ensemble_eval`` (False), ``num_frames`` (16),
                ``spatial_size`` (224), ``interpolate_coordinates`` (1),
                ``input_channels`` (3) and
                ``k_sec`` (``{2: 3, 3: 4, 4: 6, 5: 3}``).
        """
        super(MFNET_3D_MO_COMB, self).__init__()

        # Keep 3 heads to drop all charego3 tasks, else 4 to drop the egtea
        # VN tasks.
        kept_heads = 3 if kwargs.get('map_charades') else 4
        self.num_classes = num_classes[0:kept_heads]
        # Subtract 2 to remove one of the hand tasks.
        self.num_coords = kwargs.get('num_coords', 0) - 2
        self.num_objects = kwargs.get('num_objects', None)
        self.num_obj_cat = kwargs.get('num_obj_cat', None)
        self.one_object_layer = kwargs.get('one_object_layer', False)
        self.norm = kwargs.get('norm', 'BN')  # alternatives: 'GN' or 'IN'
        self.ensemble_eval = kwargs.get('ensemble_eval', False)
        self.t_dim_in = kwargs.get('num_frames', 16)
        self.s_dim_in = kwargs.get('spatial_size', 224)
        self.interpolate_coords = kwargs.get('interpolate_coordinates', 1)
        in_ch = kwargs.get('input_channels', 3)
        groups = 16
        k_sec = kwargs.get('k_sec', {2: 3, 3: 4, 4: 6, 5: 3})

        # Output widths per stage: 16, 96, 192, 384, 768.
        c1_out = 16
        c2_out = 96
        c3_out = 2 * c2_out
        c4_out = 2 * c3_out
        c5_out = 2 * c4_out

        conv1normlayer, tailnorm = get_norm_layers(self.norm, c1_out, c5_out)

        def build_stage(width_in, width_out, width_mid, first_stride, depth):
            # Chain `depth` MF_UNITs named B01, B02, ...; only the first one
            # changes the channel count and carries the stage stride.
            units = OrderedDict()
            for idx in range(1, depth + 1):
                leading = idx == 1
                units["B%02d" % idx] = MF_UNIT(
                    num_in=width_in if leading else width_out,
                    num_mid=width_mid,
                    num_out=width_out,
                    stride=first_stride if leading else (1, 1, 1),
                    g=groups,
                    first_block=leading,
                    norm=self.norm)
            return nn.Sequential(units)

        # conv1: strided 3x5x5 convolution, norm layer, ReLU, then max-pool.
        conv1_layers = OrderedDict([
            ('conv',
             nn.Conv3d(in_ch,
                       c1_out,
                       kernel_size=(3, 5, 5),
                       padding=(1, 2, 2),
                       stride=(1, 2, 2),
                       bias=False)),
            conv1normlayer,
            ('relu', nn.ReLU(inplace=True)),
        ])
        self.conv1 = nn.Sequential(conv1_layers)
        self.maxpool = nn.MaxPool3d(kernel_size=(1, 3, 3),
                                    stride=(1, 2, 2),
                                    padding=(0, 1, 1))

        # conv2-conv5: the bottleneck width doubles from one stage to the
        # next. conv2's first unit has a temporal stride of 2; the later
        # stages have a spatial stride of 2 instead.
        mid_width = 96
        self.conv2 = build_stage(c1_out, c2_out, mid_width, (2, 1, 1),
                                 k_sec[2])
        self.conv3 = build_stage(c2_out, c3_out, 2 * mid_width, (1, 2, 2),
                                 k_sec[3])
        self.conv4 = build_stage(c3_out, c4_out, 4 * mid_width, (1, 2, 2),
                                 k_sec[4])
        self.conv5 = build_stage(c4_out, c5_out, 8 * mid_width, (1, 2, 2),
                                 k_sec[5])

        # Coordinate regression (heatmaps) is only built when coordinate
        # tasks remain after the subtraction above.
        if self.num_coords > 0:
            self.coord_layers = CoordRegressionLayer(c5_out, self.num_coords,
                                                     self.interpolate_coords)

        # Final norm + ReLU.
        self.tail = nn.Sequential(
            OrderedDict([tailnorm, ('relu', nn.ReLU(inplace=True))]))

        # Average pool over half the input frames and 1/32 of the spatial
        # size, optionally followed by dropout.
        pooled_side = self.s_dim_in // 32
        avg_pool = nn.AvgPool3d(kernel_size=(self.t_dim_in // 2, pooled_side,
                                             pooled_side),
                                stride=(1, 1, 1))
        self.globalpool = nn.Sequential(OrderedDict([('avg', avg_pool)]))
        if dropout:
            self.globalpool.add_module('dropout', nn.Dropout(p=dropout))

        self.classifier_list = MultitaskClassifiers(c5_out, self.num_classes)

        # One presence layer per entry, e.g. one per dataset when several
        # object vocabularies are configured; object layers are registered
        # before object-category layers.
        presence_specs = (
            ('object_presence_layer_{}', self.num_objects),
            ('objcat_presence_layer_{}', self.num_obj_cat),
        )
        for name_pattern, counts in presence_specs:
            if not counts:
                continue
            for idx, n_outputs in enumerate(counts):
                presence_layer = ObjectPresenceLayer(
                    c5_out, n_outputs, one_layer=self.one_object_layer)
                self.add_module(name_pattern.format(idx), presence_layer)

        # Hand the assembled network to the project's `xavier` initialiser.
        xavier(net=self)