Beispiel #1
0
    def __init__(self,
                 vocab,
                 feature_dim=(1024, 14, 14),
                 stem_module_dim=128,
                 stem_use_resnet=False,
                 stem_resnet_fixed=False,
                 resnet_model_stage=3,
                 stem_num_layers=2,
                 stem_batchnorm=False,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_stride2_freq=0,
                 stem_padding=None,
                 fc_dims=(1024, ),
                 fc_use_batchnorm=False,
                 fc_dropout=0):
        """Assemble the CNN baseline: stem -> 1x1 conv -> full-map max-pool
        -> fully-connected classifier over the answer vocabulary.
        """
        super(CnnModel, self).__init__()

        in_C, in_H, in_W = feature_dim

        # Convolutional stem mapping in_C input channels to stem_module_dim.
        self.stem = build_stem(stem_use_resnet,
                               stem_resnet_fixed,
                               in_C,
                               stem_module_dim,
                               resnet_model_stage=resnet_model_stage,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               stride2_freq=stem_stride2_freq,
                               padding=stem_padding)

        # Each stride-2 layer halves the spatial size; the stem applies
        # stem_num_layers // stem_stride2_freq of them when enabled.
        if stem_stride2_freq > 0:
            shrink = 2**(stem_num_layers // stem_stride2_freq)
        else:
            shrink = 1
        module_H = in_H // shrink
        module_W = in_W // shrink

        # 1x1 convolution that keeps the channel count unchanged.
        self.conv = nn.Conv2d(stem_module_dim,
                              stem_module_dim,
                              kernel_size=1,
                              padding=0)
        # Pooling over the entire feature map collapses H x W to 1 x 1.
        self.pool = nn.MaxPool2d(kernel_size=(module_H, module_W),
                                 stride=(module_H, module_W))

        self.classifier = build_classifier(module_C=stem_module_dim,
                                           module_H=None,
                                           module_W=None,
                                           num_answers=len(
                                               vocab['answer_token_to_idx']),
                                           fc_dims=fc_dims,
                                           proj_dim=None,
                                           downsample=None,
                                           with_batchnorm=fc_use_batchnorm,
                                           dropout=fc_dropout)
Beispiel #2
0
    def __init__(self,
                 vocab,
                 feature_dim,
                 stem_num_layers,
                 stem_kernel_size,
                 stem_stride,
                 stem_padding,
                 stem_batchnorm,
                 module_dim,
                 module_batchnorm,
                 verbose=True):
        """Wire up vocab lookups, the per-operation neural modules, the
        stem, and the answer classifier, then register each module.
        """
        super(ModuleNet, self).__init__()

        self.program_idx_to_token = vocab['program_idx_to_token']
        # NOTE(review): the attribute name suggests answer->idx, but the
        # stored mapping is 'answer_idx_to_token'; kept as-is since other
        # code may rely on this attribute name.
        self.answer_to_idx = vocab['answer_idx_to_token']
        self.text_token_to_idx = vocab['text_token_to_idx']
        self.program_token_to_module_text = vocab[
            'program_token_to_module_text']

        num_text_tokens = len(self.text_token_to_idx)
        self.name_to_module = {
            'and': And(),
            'answer': lambda x: x,
            'find': Find(module_dim, num_text_tokens),
            'transform': Transform(num_text_tokens),
        }
        self.name_to_num_inputs = {
            'and': 2,
            'answer': 1,
            'find': 1,
            'transform': 1,
        }

        input_C, _input_H, _input_W = feature_dim
        self.stem = build_stem(input_C,
                               module_dim,
                               num_layers=stem_num_layers,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               with_batchnorm=stem_batchnorm)

        self.classifier = Answer(len(self.answer_to_idx))

        if verbose:
            print('Here is my stem:')
            print(self.stem)
            print('Here is my classifier:')
            print(self.classifier)

        # 'answer' is a plain lambda rather than an nn.Module, so it is
        # excluded from module registration.
        for name, module in self.name_to_module.items():
            if name != 'answer':
                self.add_module(name, module)

        self.save_module_outputs = False
Beispiel #3
0
    def __init__(self,
                 vocab,
                 feature_dim=(3, 64, 64),
                 stem_dim=128,
                 module_dim=128,
                 stem_num_layers=2,
                 stem_batchnorm=True,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_padding=None,
                 stem_feature_dim=24,
                 stem_subsample_layers=None,
                 classifier_fc_layers=(1024, ),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 rnn_hidden_dim=128,
                 **kwargs):
        """Build a stem over the input image and a classifier over the
        concatenation of flattened image features and the question vector.

        Changes from the original:
        * ``feature_dim`` now defaults to a tuple; the old list default was
          a mutable default argument. Callers passing lists still work.
        * the stem-output unpacking no longer binds a local named ``F``,
          which shadowed the conventional ``torch.nn.functional`` alias.

        NOTE(review): ``stem_feature_dim`` is accepted but never used in
        this constructor — confirm whether it can be removed upstream.
        """
        super().__init__()

        # initialize stem
        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers)
        # Probe the stem with a dummy batch to discover its output shape;
        # [1, *feature_dim] accepts both tuple- and list-typed feature_dim.
        tmp = self.stem(Variable(torch.zeros([1, *feature_dim])))
        _, out_C, out_H, out_W = tmp.size()

        # initialize classifier
        # TODO(mnoukhov): fix this for >1 layer RNN
        question_dim = rnn_hidden_dim
        image_dim = out_C * out_H * out_W
        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(image_dim + question_dim, 1, 1,
                                           num_answers, classifier_fc_layers,
                                           None, None, classifier_batchnorm,
                                           classifier_dropout)

        init_modules(self.modules())
Beispiel #4
0
    def __init__(self,
                 vocab,
                 feature_dim=(1024, 14, 14),
                 stem_num_layers=2,
                 stem_batchnorm=False,
                 module_dim=128,
                 module_residual=True,
                 module_batchnorm=False,
                 classifier_proj_dim=512,
                 classifier_downsample='maxpool2',
                 classifier_fc_layers=(1024, ),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 verbose=True):
        """Build the stem, the answer classifier, and one neural module per
        program token: 'scene'/unary tokens get a ResidualBlock, binary
        tokens a ConcatBlock. Any other arity is rejected explicitly.
        """
        super(ModuleNet, self).__init__()

        self.stem = build_stem(feature_dim[0],
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm)
        if verbose:
            print('Here is my stem:')
            print(self.stem)

        num_answers = len(vocab['answer_idx_to_token'])
        module_H, module_W = feature_dim[1], feature_dim[2]
        self.classifier = build_classifier(module_dim,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)
        if verbose:
            print('Here is my classifier:')
            print(self.classifier)
        # Timing bookkeeping.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        self.function_modules = {}
        self.function_modules_num_inputs = {}
        self.vocab = vocab
        for fn_str in vocab['program_token_to_idx']:
            num_inputs = vr.programs.get_num_inputs(fn_str)
            self.function_modules_num_inputs[fn_str] = num_inputs
            if fn_str == 'scene' or num_inputs == 1:
                mod = ResidualBlock(module_dim,
                                    with_residual=module_residual,
                                    with_batchnorm=module_batchnorm)
            elif num_inputs == 2:
                mod = ConcatBlock(module_dim,
                                  with_residual=module_residual,
                                  with_batchnorm=module_batchnorm)
            else:
                # BUG FIX: previously an unexpected arity silently reused
                # `mod` from the preceding iteration (or raised NameError
                # on the first token); fail loudly instead.
                raise ValueError(
                    'unsupported arity %d for program token %r' %
                    (num_inputs, fn_str))
            self.add_module(fn_str, mod)
            self.function_modules[fn_str] = mod

        self.save_module_outputs = False
Beispiel #5
0
    def __init__(
            self,
            vocab,
            feature_dim=(1024, 14, 14),
            stem_num_layers=2,
            stem_batchnorm=False,
            stem_kernel_size=3,
            stem_subsample_layers=None,
            stem_stride=1,
            stem_padding=None,
            stem_dim=64,
            num_modules=4,
            module_num_layers=1,
            module_dim=128,
            module_residual=True,
            module_intermediate_batchnorm=False,
            module_batchnorm=False,
            module_batchnorm_affine=False,
            module_dropout=0,
            module_input_proj=1,
            module_kernel_size=3,
            classifier_proj_dim=512,
            classifier_downsample='maxpool2',
            classifier_fc_layers=(1024, ),
            classifier_batchnorm=False,
            classifier_dropout=0,
            condition_method='bn-film',
            condition_pattern=[],
            use_gamma=True,
            use_beta=True,
            use_coords=1,
            debug_every=float('inf'),
            print_verbose_every=float('inf'),
            verbose=True,
    ):
        """Construct a FiLMed network: a convolutional stem (optionally fed
        extra coordinate channels), ``num_modules`` FiLMed residual blocks,
        and a convolutional answer classifier.

        NOTE(review): ``condition_pattern=[]`` is a mutable default
        argument; it is only read and rebound below, never mutated, so it
        is harmless in practice but worth cleaning up.
        """
        super(FiLMedNet, self).__init__()

        num_answers = len(vocab['answer_idx_to_token'])

        # Timing bookkeeping; nothing is recorded in this constructor.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        self.num_modules = num_modules
        self.module_num_layers = module_num_layers
        self.module_batchnorm = module_batchnorm
        self.module_dim = module_dim
        self.condition_method = condition_method
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.use_coords_freq = use_coords
        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # Initialize helper variables
        # Coordinate channels are prepended to the stem input only when the
        # stem preserves resolution (stride 1) and coords are enabled.
        self.stem_use_coords = (stem_stride
                                == 1) and (self.use_coords_freq > 0)
        self.condition_pattern = condition_pattern
        if len(condition_pattern) == 0:
            # Default: condition every layer of every module, except when
            # conditioning is done via concatenation.
            self.condition_pattern = []
            for i in range(self.module_num_layers * self.num_modules):
                self.condition_pattern.append(
                    self.condition_method != 'concat')
        else:
            # Normalize a user-supplied numeric pattern to booleans.
            self.condition_pattern = [i > 0 for i in self.condition_pattern]
        self.extra_channel_freq = self.use_coords_freq
        self.block = FiLMedResBlock
        # 'concat' conditioning feeds 2 * module_dim conditioning maps.
        self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
        self.fwd_count = 0
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
        if self.debug_every <= -1:
            self.print_verbose_every = 1

        # Initialize stem
        # Widen the stem input by the coordinate channels, when used.
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels
        self.stem = build_stem(stem_feature_dim,
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers)
        # Probe the stem with a dummy batch to discover its output H x W.
        tmp = self.stem(
            Variable(
                torch.zeros(
                    [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)

        self.stem_coords = coord_map((feature_dim[1], feature_dim[2]))
        self.coords = coord_map((module_H, module_W))
        # Default FiLM weight/bias tensors (all ones / all zeros).
        self.default_weight = torch.ones(1, 1, self.module_dim).to(device)
        self.default_bias = torch.zeros(1, 1, self.module_dim).to(device)

        # Initialize FiLMed network body
        self.function_modules = {}
        self.vocab = vocab
        for fn_num in range(self.num_modules):
            # Slice out this module's per-layer conditioning flags.
            with_cond = self.condition_pattern[self.module_num_layers *
                                               fn_num:self.module_num_layers *
                                               (fn_num + 1)]
            mod = self.block(
                module_dim,
                with_residual=module_residual,
                with_intermediate_batchnorm=module_intermediate_batchnorm,
                with_batchnorm=module_batchnorm,
                with_cond=with_cond,
                dropout=module_dropout,
                num_extra_channels=self.num_extra_channels,
                extra_channel_freq=self.extra_channel_freq,
                with_input_proj=module_input_proj,
                num_cond_maps=self.num_cond_maps,
                kernel_size=module_kernel_size,
                batchnorm_affine=module_batchnorm_affine,
                num_layers=self.module_num_layers,
                condition_method=condition_method,
                debug_every=self.debug_every)
            self.add_module(str(fn_num), mod)
            self.function_modules[fn_num] = mod

        # Initialize output classifier
        self.classifier = build_classifier(module_dim +
                                           self.num_extra_channels,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)

        init_modules(self.modules())
    def __init__(self,
                 vocab,
                 feature_dim,
                 stem_num_layers,
                 stem_batchnorm,
                 stem_subsample_layers,
                 stem_kernel_size,
                 stem_stride,
                 stem_padding,
                 stem_dim,
                 module_dim,
                 module_kernel_size,
                 module_input_proj,
                 forward_func,
                 use_color,
                 module_residual=True,
                 module_batchnorm=False,
                 classifier_proj_dim=512,
                 classifier_downsample='maxpool2',
                 classifier_fc_layers=(1024, ),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 use_film=False,
                 verbose=True):
        """Build a stem, a classifier, and either one shared pair of
        FiLMed unary/binary blocks (``use_film=True``) or one
        Residual/Concat block per program token.

        NOTE(review): ``stem_stride`` is accepted but never forwarded to
        ``build_stem`` below, so the stem uses build_stem's default stride.
        Confirm whether that is intentional.
        """
        super().__init__()

        self.module_dim = module_dim
        # Resolve the forward strategy (e.g. 'tree') from the registry once.
        self.func = FUNC_DICT[forward_func]
        self.use_color = use_color

        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               subsample_layers=stem_subsample_layers,
                               kernel_size=stem_kernel_size,
                               padding=stem_padding,
                               with_batchnorm=stem_batchnorm)
        # Probe the stem with a dummy batch to discover its output H x W.
        tmp = self.stem(
            Variable(
                torch.zeros(
                    [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)

        # Coordinate map with a leading batch dimension prepended.
        self.coords = coord_map((module_H, module_W)).unsqueeze(0)

        if verbose:
            print('Here is my stem:')
            print(self.stem)

        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(module_dim,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)
        if verbose:
            print('Here is my classifier:')
            print(self.classifier)

        self.unary_function_modules = {}
        self.binary_function_modules = {}
        self.vocab = vocab
        self.use_film = use_film

        if self.use_film:
            # One shared FiLMed block per arity, registered under the key
            # 'film' and reused for every program token.
            unary_mod = FiLMedResBlock(
                module_dim,
                with_residual=module_residual,
                with_intermediate_batchnorm=False,
                with_batchnorm=False,
                with_cond=[True, True],
                num_extra_channels=2,  # was 2 for original film,
                extra_channel_freq=1,
                with_input_proj=module_input_proj,
                num_cond_maps=0,
                kernel_size=module_kernel_size,
                batchnorm_affine=False,
                num_layers=1,
                condition_method='bn-film',
                debug_every=float('inf'))
            binary_mod = ConcatFiLMedResBlock(
                2,
                module_dim,
                with_residual=module_residual,
                with_intermediate_batchnorm=False,
                with_batchnorm=False,
                with_cond=[True, True],
                num_extra_channels=2,  #was 2 for original film,
                extra_channel_freq=1,
                with_input_proj=module_input_proj,
                num_cond_maps=0,
                kernel_size=module_kernel_size,
                batchnorm_affine=False,
                num_layers=1,
                condition_method='bn-film',
                debug_every=float('inf'))

            self.unary_function_modules['film'] = unary_mod
            self.binary_function_modules['film'] = binary_mod
            self.add_module('film_unary', unary_mod)
            self.add_module('film_binary', binary_mod)

        else:
            # One dedicated block per program token, keyed by token string.
            # Binary tokens get a ConcatBlock only under 'tree' forwarding.
            for fn_str in vocab['program_token_to_idx']:
                arity = self.vocab['program_token_arity'][fn_str]
                if arity == 2 and forward_func == 'tree':
                    binary_mod = ConcatBlock(module_dim,
                                             kernel_size=module_kernel_size,
                                             with_residual=module_residual,
                                             with_batchnorm=module_batchnorm,
                                             use_simple=False)

                    self.add_module(fn_str, binary_mod)
                    self.binary_function_modules[fn_str] = binary_mod

                else:
                    mod = ResidualBlock(module_dim,
                                        kernel_size=module_kernel_size,
                                        with_residual=module_residual,
                                        with_batchnorm=module_batchnorm)

                    self.add_module(fn_str, mod)
                    self.unary_function_modules[fn_str] = mod

        self.declare_film_coefficients()
Beispiel #7
0
    def __init__(self,
                 vocab,
                 feature_dim=(3, 64, 64),
                 stem_num_layers=2,
                 stem_batchnorm=True,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_padding=None,
                 stem_dim=24,
                 module_num_layers=1,
                 module_dim=128,
                 classifier_fc_layers=(1024,),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 rnn_hidden_dim=128,
                 # unused
                 stem_subsample_layers=None,
                 module_input_proj=None,
                 module_residual=None,
                 module_kernel_size=None,
                 module_batchnorm=None,
                 classifier_proj_dim=None,
                 classifier_downsample=None,
                 debug_every=float('inf'),
                 print_verbose_every=float('inf'),
                 verbose=True):
        """Build a Relation-Network-style model: a stem over the image, a
        pairwise relation MLP over (object, object, question) inputs, and
        a final classifier (f_theta).

        Fix: ``stem_subsample_layers`` previously defaulted to ``[]``, a
        mutable default argument; it now defaults to None and is
        normalized to a fresh list below, preserving behavior.
        """
        super().__init__()

        # Mutable-default fix: create a fresh list per instance.
        if stem_subsample_layers is None:
            stem_subsample_layers = []

        # initialize stem
        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               stem_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers)
        # Probe the stem with a dummy batch to discover its output H x W.
        tmp = self.stem(
            Variable(
                torch.zeros(
                    [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)

        # initialize coordinates to be appended to "objects"
        # can be switched to using torch.meshgrid after 0.4.1
        x = torch.linspace(-1, 1, steps=module_W)
        y = torch.linspace(-1, 1, steps=module_H)
        xv = x.unsqueeze(1).repeat(1, module_H)
        yv = y.unsqueeze(0).repeat(module_W, 1)
        coords = torch.stack([xv, yv], dim=2).view(-1, 2)
        self.coords = Variable(coords.to(device))

        # initialize relation model
        # (output of stem + 2 coordinates) * 2 objects + question vector
        relation_modules = [
            nn.Linear((stem_dim + 2) * 2 + rnn_hidden_dim, module_dim)
        ]
        for _ in range(module_num_layers - 1):
            relation_modules.append(nn.Linear(module_dim, module_dim))
        self.relation = nn.Sequential(*relation_modules)

        # initialize classifier (f_theta)
        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(module_dim,
                                           1,
                                           1,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           classifier_batchnorm,
                                           classifier_dropout)

        init_modules(self.modules())
Beispiel #8
0
    def __init__(
            self,
            vocab,
            feature_dim=(1024, 14, 14),
            stem_num_layers=2,
            stem_batchnorm=False,
            stem_kernel_size=3,
            stem_stride=1,
            stem_padding=None,
            num_modules=4,
            module_num_layers=1,
            module_dim=128,
            module_residual=True,
            module_batchnorm=False,
            module_batchnorm_affine=False,
            module_dropout=0,
            module_input_proj=1,
            module_kernel_size=3,
            classifier_proj_dim=512,
            classifier_downsample='maxpool2',
            classifier_fc_layers=(1024, ),
            classifier_batchnorm=False,
            classifier_dropout=0,
            condition_method='bn-film',
            condition_pattern=[],
            use_gamma=True,
            use_beta=True,
            use_coords=1,

            # for Language part:
            null_token=0,
            start_token=1,
            end_token=2,
            encoder_embed=None,
            encoder_vocab_size=100,
            decoder_vocab_size=100,
            wordvec_dim=200,
            hidden_dim=512,
            rnn_num_layers=1,
            rnn_dropout=0,
            rnn_time_step=None,
            output_batchnorm=False,
            bidirectional=False,
            encoder_type='gru',
            decoder_type='linear',
            gamma_option='linear',
            gamma_baseline=1,
            parameter_efficient=False,
            debug_every=float('inf'),
            print_verbose_every=float('inf'),
            verbose=True,
    ):
        super(FiLMedNet, self).__init__()

        self.vocab = vocab
        num_answers = len(vocab['answer_idx_to_token'])

        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False
        # for image part
        self.num_modules = num_modules
        self.module_num_layers = module_num_layers
        self.module_batchnorm = module_batchnorm
        self.module_dim = module_dim  # 128
        self.condition_method = condition_method
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.use_coords_freq = use_coords  # == 1
        self.feature_dim = feature_dim

        # for language part
        self.encoder_type = encoder_type
        self.decoder_type = decoder_type
        self.output_batchnorm = output_batchnorm
        self.bidirectional = bidirectional
        self.rnn_time_step = rnn_time_step
        self.hidden_dim = hidden_dim
        self.num_dir = 2 if self.bidirectional else 1
        self.gamma_option = gamma_option
        self.gamma_baseline = gamma_baseline  # =1
        self.debug_every = debug_every
        self.NULL = null_token
        self.START = start_token
        self.END = end_token

        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # initialize rnn
        if self.bidirectional:  # yes
            if decoder_type != 'linear':
                raise (NotImplementedError)
            hidden_dim = (int)(hidden_dim / self.num_dir)

        self.func_list = {
            'linear': None,
            'sigmoid': F.sigmoid,
            'tanh': F.tanh,
            'exp': torch.exp,
            'relu': F.relu
        }

        self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
        if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
            self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules

        self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
        self.encoder_rnn = init_rnn(self.encoder_type,
                                    wordvec_dim,
                                    hidden_dim,
                                    rnn_num_layers,
                                    dropout=rnn_dropout,
                                    bidirectional=self.bidirectional)

        # Initialize stem for rnn
        self.use_rnn_stem = False
        self.stem_rnn_size = int(256 / 2)
        if self.use_rnn_stem:
            self.stem_rnn = init_rnn(self.encoder_type,
                                     self.hidden_dim,
                                     self.stem_rnn_size,
                                     rnn_num_layers,
                                     dropout=rnn_dropout,
                                     bidirectional=self.bidirectional)
            self.hidden_dim = self.stem_rnn_size * 2
            hidden_dim = self.stem_rnn_size

        self.condition_block = {}
        for fn_num in range(self.num_modules):
            mod = nn.Linear(hidden_dim * self.num_dir,
                            self.cond_feat_size)  # gamma, beta for each block.
            self.condition_block[fn_num] = mod
            self.add_module('condition_block_' + str(fn_num), mod)

        # build sentence conditioning for each module:
        self.condition_rnn = {}
        self.cond_rnn_pool = False
        self.modulewise_cond = False
        self.cond_cnn_proj = True
        self.cond_rnn_pool_size = 3
        self.cond_rnn_flatten = Flatten()

        if self.cond_rnn_pool:
            self.cond_rnn_dim_in = self.module_dim * np.floor(
                (feature_dim[1] - self.cond_rnn_pool_size) /
                self.cond_rnn_pool_size +
                1) * np.floor((feature_dim[1] - self.cond_rnn_pool_size) /
                              self.cond_rnn_pool_size + 1)
            self.cond_rnn_dim_in = int(self.cond_rnn_dim_in)
        else:
            self.cond_rnn_dim_in = self.module_dim * feature_dim[
                1] * feature_dim[2]

        self.full_pooling = nn.MaxPool2d(kernel_size=self.cond_rnn_pool_size,
                                         padding=0)

        if self.output_batchnorm:
            self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)

        # Initialize helper variables
        self.stem_use_coords = (stem_stride
                                == 1) and (self.use_coords_freq > 0)
        self.condition_pattern = condition_pattern
        if len(condition_pattern) == 0:
            self.condition_pattern = []
            for i in range(self.module_num_layers * self.num_modules):
                self.condition_pattern.append(
                    self.condition_method != 'concat')
        else:
            self.condition_pattern = [i > 0 for i in self.condition_pattern]
        self.extra_channel_freq = self.use_coords_freq
        self.block = FiLMedResBlock

        self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
        self.fwd_count = 0
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0  # == 2
        if self.debug_every <= -1:
            self.print_verbose_every = 1
        module_H = feature_dim[1] // (stem_stride**stem_num_layers)
        module_W = feature_dim[2] // (stem_stride**stem_num_layers)
        self.coords = coord_map(
            (module_H,
             module_W))  # size(2,module_H, module_W) expanded linspace.
        self.default_weight = Parameter(
            torch.ones(1, 1, self.module_dim).type(torch.cuda.FloatTensor),
            requires_grad=False)  # to instead film, not used.
        self.default_bias = Parameter(torch.zeros(1, 1, self.module_dim).type(
            torch.cuda.FloatTensor),
                                      requires_grad=False)  # not used.

        # Initialize stem
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels  # 1024 + 2
        self.stem = build_stem(
            stem_feature_dim,
            module_dim,
            num_layers=stem_num_layers,
            with_batchnorm=stem_batchnorm,
            kernel_size=stem_kernel_size,
            stride=stem_stride,
            padding=stem_padding
        )  # stem_batchnorm == 1, kernel_size=3, stride=1, padding=None
        # stem: 1-layer CNN converting 1026 channels into 128 channels.

        # Initialize FiLMed network body
        for fn_num in range(self.num_modules):
            with_cond = self.condition_pattern[self.module_num_layers *
                                               fn_num:self.module_num_layers *
                                               (fn_num + 1)]
            mod = self.block(
                module_dim,
                with_residual=module_residual,
                with_batchnorm=module_batchnorm,
                with_cond=with_cond,
                dropout=module_dropout,  # 0e-2
                num_extra_channels=self.num_extra_channels,
                extra_channel_freq=self.extra_channel_freq,
                with_input_proj=module_input_proj,
                num_cond_maps=self.num_cond_maps,
                kernel_size=module_kernel_size,
                batchnorm_affine=module_batchnorm_affine,
                num_layers=self.module_num_layers,
                condition_method=condition_method,
                debug_every=self.debug_every)
            self.add_module('block_' + str(fn_num), mod)

        for fn_num in range(self.num_modules):
            mem = LangMemBlock(self.hidden_dim)
            self.add_module('lang_mem_' + str(fn_num), mem)

        # proj rnn hidden state to common latent space.
        self.rnn_proj = nn.Linear(hidden_dim * self.num_dir,
                                  classifier_proj_dim)
        self.cnn_proj = build_cnn_proj(module_dim + self.num_extra_channels,
                                       module_H,
                                       module_W,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)

        # cond_proj_out_dim = self.hidden_dim if self.att_mode == 'simple' else 2 * self.hidden_dim
        cond_proj_out_dim = self.hidden_dim

        if self.cond_cnn_proj:
            if self.modulewise_cond == False:
                self.cnn_proj_for_cond = build_cnn_proj(
                    module_dim + self.num_extra_channels,
                    module_H,
                    module_W,
                    cond_proj_out_dim,
                    classifier_downsample,
                    with_batchnorm=classifier_batchnorm,
                    dropout=classifier_dropout)
            else:
                for fn_num in range(self.num_modules):
                    mod = build_cnn_proj(module_dim + self.num_extra_channels,
                                         module_H,
                                         module_W,
                                         cond_proj_out_dim,
                                         classifier_downsample,
                                         with_batchnorm=classifier_batchnorm,
                                         dropout=classifier_dropout)
                    self.add_module('cnn_proj_for_cond_' + str(fn_num), mod)
        else:
            for fn_num in range(self.num_modules):
                mod = nn.Linear(self.cond_rnn_dim_in, 2 *
                                self.hidden_dim)  # gamma, beta for bi-direct
                self.condition_rnn[fn_num] = mod
                self.add_module('condition_rnn_' + str(fn_num), mod)

        # Initialize output classifier
        self.classifier = build_classifier(num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)

        init_modules(self.modules())
    def __init__(self,
        vocab,
        feature_dim,
        module_dim,
        module_kernel_size,
        stem_dim,
        stem_num_layers,
        stem_subsample_layers,
        stem_kernel_size,
        stem_padding,
        stem_batchnorm,
        classifier_fc_layers,
        classifier_proj_dim,
        classifier_downsample,classifier_batchnorm,
        num_modules,
        hard_code_alpha=False,
        hard_code_tau=False,
        tau_init='random',
        alpha_init='xavier_uniform',
        model_type ='soft',
        model_bernoulli=0.5,
        use_module = 'conv',
        use_stopwords = True,
        **kwargs):
        """Build a layout-learning VQA model.

        The model's structure is governed by two sets of (optionally
        learnable) parameters: ``alpha`` — per-module attention weights over
        question tokens — and ``tau_0``/``tau_1`` — structural weights over
        module connections.  Per-token module weights are stored directly in
        the question-word embedding table: for the 'conv'/'residual' module
        types, each word's embedding is the flattened conv kernel(s) plus
        biases consumed by the module function.

        NOTE(review): ``use_stopwords`` is accepted but never referenced in
        this constructor — presumably consumed elsewhere or dead; confirm.
        ``INITS``, ``device``, ``_tree_tau``, ``_chain_tau``,
        ``_chain_with_shortcuts_tau``, ``_random_tau``, ``ConvFunc``,
        ``ResidualFunc``, ``FindModule``, ``build_stem`` and
        ``build_classifier`` are module-level names defined elsewhere in
        this file.
        """

        super().__init__()
        self.num_modules = num_modules
        # alphas and taus from Overleaf Doc.
        self.hard_code_alpha = hard_code_alpha
        self.hard_code_tau = hard_code_tau

        # NOTE(review): number of question tokens is hard-coded to 3 here;
        # alpha is therefore (num_modules, 3) regardless of actual question
        # length — confirm against the data pipeline.
        num_question_tokens = 3

        # alpha: (num_modules, num_question_tokens) module-over-token
        # attention.  'correct*' inits are the known-good layouts; 'constant'
        # fills with 1; anything else dispatches through the INITS table.
        if alpha_init.startswith('correct'):
            print('using correct initialization')
            alpha = INITS[alpha_init](torch.Tensor(num_modules, num_question_tokens))
        elif alpha_init == 'constant':
            alpha = INITS[alpha_init](torch.Tensor(num_modules, num_question_tokens), 1)
        else:
            alpha = INITS[alpha_init](torch.Tensor(num_modules, num_question_tokens))
        print('initial alpha ')
        print(alpha)


        if hard_code_alpha:
            # Hard-coded alpha only makes sense with a 'correct*' layout;
            # it is kept as a fixed (non-trainable) tensor on the global
            # `device` rather than registered as a parameter.
            assert(alpha_init.startswith('correct'))

            self.alpha = Variable(alpha)
            self.alpha = self.alpha.to(device)
        else:
            self.alpha = nn.Parameter(alpha)


        # create taus
        # tau_0/tau_1: structural connection weights; their shapes are
        # determined by the _*_tau helper functions.
        if tau_init == 'tree':
            tau_0, tau_1 = _tree_tau()
            print("initializing with tree.")
        elif tau_init == 'chain':
            tau_0, tau_1 = _chain_tau()
            print("initializing with chain")
        elif tau_init == 'chain_with_shortcuts':
            tau_0, tau_1 = _chain_with_shortcuts_tau()
            print("initializing with chain and shortcuts")

        else:
            tau_0, tau_1 = _random_tau(num_modules)

        if hard_code_tau:
            # A fixed structure must be one of the known layouts; random
            # taus would hard-code an arbitrary structure.
            assert(tau_init in ['chain', 'tree', 'chain_with_shortcuts'])
            self.tau_0 = Variable(tau_0)
            self.tau_1 = Variable(tau_1)
            self.tau_0 = self.tau_0.to(device)
            self.tau_1 = self.tau_1.to(device)
        else:
            self.tau_0   = nn.Parameter(tau_0)
            self.tau_1   = nn.Parameter(tau_1)



        if use_module == 'conv':
            # Each word embedding carries the module weights:
            #   dim_1 = bias (module_dim) + one KxK conv kernel
            #   dim_2 = bias (module_dim) + a (2*module_dim, module_dim) 1x1 mix
            embedding_dim_1 = module_dim + (module_dim*module_dim*module_kernel_size*module_kernel_size)
            embedding_dim_2 = module_dim + (2*module_dim*module_dim)

            question_embeddings_1 = nn.Embedding(len(vocab['question_idx_to_token']),embedding_dim_1)
            question_embeddings_2 = nn.Embedding(len(vocab['question_idx_to_token']),embedding_dim_2)

            # Uniform init with fan-in scaling matching each weight slice
            # (mirrors nn.Conv2d's default reset_parameters).
            stdv_1 = 1. / math.sqrt(module_dim*module_kernel_size*module_kernel_size)
            stdv_2 = 1. / math.sqrt(2*module_dim)

            question_embeddings_1.weight.data.uniform_(-stdv_1, stdv_1)
            question_embeddings_2.weight.data.uniform_(-stdv_2, stdv_2)
            # Concatenate both slices into the single embedding table that is
            # actually registered on the module; the temporaries above exist
            # only to get per-slice init scales.
            self.question_embeddings = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_1+embedding_dim_2)
            self.question_embeddings.weight.data = torch.cat([question_embeddings_1.weight.data,
                                                              question_embeddings_2.weight.data],dim=-1)

            self.func = ConvFunc(module_dim, module_kernel_size)

        elif use_module == 'residual':
            # Residual module needs two conv-kernel slices (a and b) plus the
            # 1x1 mixing slice.
            embedding_dim_1 = module_dim + (module_dim*module_dim*module_kernel_size*module_kernel_size)
            embedding_dim_2 = module_dim + (2*module_dim*module_dim)

            question_embeddings_a = nn.Embedding(len(vocab['question_idx_to_token']),embedding_dim_1)
            question_embeddings_b = nn.Embedding(len(vocab['question_idx_to_token']),embedding_dim_1)
            question_embeddings_2 = nn.Embedding(len(vocab['question_idx_to_token']),embedding_dim_2)

            stdv_1 = 1. / math.sqrt(module_dim*module_kernel_size*module_kernel_size)
            stdv_2 = 1. / math.sqrt(2*module_dim)

            question_embeddings_a.weight.data.uniform_(-stdv_1, stdv_1)
            question_embeddings_b.weight.data.uniform_(-stdv_1, stdv_1)
            question_embeddings_2.weight.data.uniform_(-stdv_2, stdv_2)
            self.question_embeddings = nn.Embedding(len(vocab['question_idx_to_token']), 2*embedding_dim_1+embedding_dim_2)
            self.question_embeddings.weight.data = torch.cat([question_embeddings_a.weight.data, question_embeddings_b.weight.data,
                                                              question_embeddings_2.weight.data],dim=-1)
            self.func = ResidualFunc(module_dim, module_kernel_size)

        else:
            # Fallback: plain module_dim-sized embeddings with a Find module.
            self.question_embeddings = nn.Embedding(len(vocab['question_idx_to_token']), module_dim)
            self.func = FindModule(module_dim, module_kernel_size)


        # stem for processing the image into a 3D tensor
        self.stem = build_stem(feature_dim[0], stem_dim, module_dim,
                   num_layers=stem_num_layers,
                   subsample_layers=stem_subsample_layers,
                   kernel_size=stem_kernel_size,
                   padding=stem_padding,
                   with_batchnorm=stem_batchnorm)

        # Dummy forward pass to infer the stem's output spatial size, so the
        # classifier can be built without duplicating the stride arithmetic.
        tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)
        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(module_dim, module_H, module_W, num_answers,
                  classifier_fc_layers,
                  classifier_proj_dim,
                  classifier_downsample,
                  with_batchnorm=classifier_batchnorm)

        self.model_type = model_type
        self.use_module = use_module
        # Prior log-odds of a tree structure under Bernoulli(model_bernoulli),
        # kept learnable.
        p = model_bernoulli
        tree_odds = -numpy.log((1 - p) / p)
        self.tree_odds = nn.Parameter(torch.Tensor([tree_odds]))
Beispiel #10
0
  def __init__(self, vocab, feature_dim=(1024, 14, 14),
               stem_use_resnet=False,
               stem_resnet_fixed=False,
               resnet_model_stage=3,
               stem_num_layers=2,
               stem_batchnorm=False,
               stem_kernel_size=3,
               stem_stride=1,
               stem_stride2_freq=0,
               stem_padding=None,
               module_dim=128,
               module_residual=True,
               module_batchnorm=False,
               classifier_proj_dim=512,
               classifier_downsample='maxpool2',
               classifier_fc_layers=(1024,),
               classifier_batchnorm=False,
               classifier_dropout=0,
               verbose=True):
    """Program-executor network: a CNN stem, one neural module per program
    token (ResidualBlock for arity 0/1, ConcatBlock for arity >= 2), and a
    final answer classifier."""
    super(ModuleNet, self).__init__()

    # Image stem: maps raw feature channels down to module_dim channels.
    self.stem = build_stem(stem_use_resnet, stem_resnet_fixed, feature_dim[0],
                           module_dim,
                           resnet_model_stage=resnet_model_stage,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           stride2_freq=stem_stride2_freq,
                           padding=stem_padding)
    if verbose:
      print('Here is my stem:')
      print(self.stem)

    # Spatial size after the stem: H/W are halved once for every
    # stem_stride2_freq stem layers (no shrink when the feature is 0).
    shrink = 1
    if stem_stride2_freq > 0:
      shrink = 2 ** (stem_num_layers // stem_stride2_freq)
    grid_h = feature_dim[1] // shrink
    grid_w = feature_dim[2] // shrink

    self.classifier = build_classifier(module_dim, grid_h, grid_w,
                                       len(vocab['answer_idx_to_token']),
                                       classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
      print('Here is my classifier:')
      print(self.classifier)

    # Timing bookkeeping; only populated when self.timing is switched on.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.function_modules = {}
    self.function_modules_num_inputs = dict(vocab['program_token_num_inputs'])
    self.vocab = vocab

    # self.scene becomes the first zero-arity token encountered, overridden
    # by the literal 'scene' token if one exists.
    self.scene = None
    for fn_str in vocab['program_token_to_idx']:
      arity = self.function_modules_num_inputs[fn_str]
      if (arity == 0 and self.scene is None) or fn_str == 'scene':
        self.scene = fn_str
      if arity in (0, 1):
        mod = ResidualBlock(module_dim,
                            with_residual=module_residual,
                            with_batchnorm=module_batchnorm)
      else:
        mod = ConcatBlock(arity, module_dim,
                          with_residual=module_residual,
                          with_batchnorm=module_batchnorm)
      # Register under the token name so parameters are tracked, and keep a
      # by-name lookup table for execution.
      self.add_module(fn_str, mod)
      self.function_modules[fn_str] = mod

    self.save_module_outputs = False
Beispiel #11
0
    def __init__(self,
                 vocab,
                 feature_dim,
                 use_film,
                 use_simple_block,
                 stem_num_layers,
                 stem_batchnorm,
                 stem_subsample_layers,
                 stem_kernel_size,
                 stem_stride,
                 stem_padding,
                 stem_dim,
                 module_dim,
                 module_pool,
                 module_use_gammas,
                 module_kernel_size,
                 module_input_proj,
                 module_residual=True,
                 module_batchnorm=False,
                 module_num_layers=1,
                 mod_id_loss=False,
                 kl_loss=False,
                 learn_control=False,
                 rnn_dim=None,
                 classifier_proj_dim=512,
                 classifier_downsample='maxpool2',
                 classifier_fc_layers=(1024, ),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 discriminator_proj_dim=None,
                 discriminator_downsample=None,
                 discriminator_fc_layers=None,
                 discriminator_dropout=None,
                 verbose=True,
                 type_anonymizer=False):
        """Program-executor network with optional FiLM-style modules.

        use_film selects the module family: 0 = classic Residual/Concat
        blocks, 1 = FiLM modules sharing one SharedFiLMedModule core,
        2 = FiLM modules each with their own core.  Optional auxiliary
        heads: a module-identifier discriminator (mod_id_loss) and a
        KL/post-linear path (kl_loss).  feature_dim is assumed to be
        (C, H, W) — it is used to size a dummy stem input below.
        """
        super(ModuleNet, self).__init__()

        # The discriminator head defaults to the classifier's
        # hyper-parameters unless overridden explicitly.
        if discriminator_proj_dim is None:
            discriminator_proj_dim = classifier_proj_dim
        if discriminator_downsample is None:
            discriminator_downsample = classifier_downsample
        if discriminator_fc_layers is None:
            discriminator_fc_layers = classifier_fc_layers
        if discriminator_dropout is None:
            discriminator_dropout = classifier_dropout

        self.module_dim = module_dim
        self.use_film = use_film
        self.use_simple_block = use_simple_block
        self.mod_id_loss = mod_id_loss
        self.kl_loss = kl_loss
        self.learn_control = learn_control

        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               subsample_layers=stem_subsample_layers,
                               kernel_size=stem_kernel_size,
                               padding=stem_padding,
                               with_batchnorm=stem_batchnorm)
        # Dummy forward pass to infer the stem's output spatial size rather
        # than duplicating the subsampling arithmetic here.
        tmp = self.stem(
            Variable(
                torch.zeros(
                    [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)

        # 2-channel coordinate map matching the stem's output grid.
        self.coords = coord_map((module_H, module_W))

        if verbose:
            print('Here is my stem:')
            print(self.stem)

        classifier_kwargs = dict(module_C=module_dim,
                                 module_H=module_H,
                                 module_W=module_W,
                                 num_answers=len(vocab['answer_idx_to_token']),
                                 fc_dims=classifier_fc_layers,
                                 proj_dim=classifier_proj_dim,
                                 downsample=classifier_downsample,
                                 with_batchnorm=classifier_batchnorm,
                                 dropout=classifier_dropout)
        # The discriminator predicts program tokens from module outputs,
        # hence num_answers = size of the program vocabulary.
        discriminator_kwargs = dict(module_C=module_dim,
                                    module_H=module_H,
                                    module_W=module_W,
                                    num_answers=len(
                                        vocab['program_idx_to_token']),
                                    fc_dims=discriminator_fc_layers,
                                    proj_dim=discriminator_proj_dim,
                                    downsample=discriminator_downsample,
                                    with_batchnorm=False,
                                    dropout=discriminator_dropout)
        if self.use_film:
            # NOTE(review): with FiLM the heads are built for a 1x1 spatial
            # input — presumably the FiLM path pools spatially before these
            # heads; confirm against the forward pass.
            classifier_kwargs['module_H'] = 1
            classifier_kwargs['module_W'] = 1
            discriminator_kwargs['module_H'] = 1
            discriminator_kwargs['module_W'] = 1

        self.classifier = build_classifier(**classifier_kwargs)
        if self.mod_id_loss:
            self.module_identifier = build_classifier(**discriminator_kwargs)

        if verbose:
            print('Here is my classifier:')
            print(self.classifier)

        self.function_modules = {}
        self.function_modules_num_inputs = {}
        self.vocab = vocab

        # One shared core for all modules when anonymizing types or when
        # use_film == 1; otherwise each module owns its weights.
        shared_block = None
        if type_anonymizer:
            shared_block = ResidualBlock(module_dim,
                                         kernel_size=module_kernel_size,
                                         with_residual=module_residual,
                                         with_batchnorm=module_batchnorm)
        elif use_film == 1:
            assert module_W == module_H
            shared_block = SharedFiLMedModule(
                module_dim,
                kernel_size=module_kernel_size,
                num_layers=module_num_layers,
                with_residual=module_residual,
                pool=module_pool,
                use_gammas=module_use_gammas,
                post_linear=kl_loss,
                learn_embeddings=not learn_control)
        if shared_block:
            self.shared_block = shared_block
            self.add_module('shared', shared_block)

        for fn_str, fn_idx in vocab['program_token_to_idx'].items():
            num_inputs = vocab['program_token_arity'][fn_str]
            self.function_modules_num_inputs[fn_str] = num_inputs

            # Closure over num_inputs/fn_idx from this iteration; safe
            # despite late binding because it is called immediately below,
            # within the same iteration.
            def create_module():
                if num_inputs > 2:
                    raise Exception('Not implemented!')

                if use_film == 1:
                    # Thin wrapper selecting this token's embedding in the
                    # shared FiLM core.
                    return FiLMModule(shared_block, fn_idx)

                if use_film == 2:
                    # Per-token FiLM core (no weight sharing).
                    separate_core_block = SharedFiLMedModule(
                        module_dim,
                        module_W,
                        kernel_size=module_kernel_size,
                        with_residual=module_residual)
                    return FiLMModule(separate_core_block, fn_idx)

                if use_simple_block:
                    # brutally simple concatentation block
                    # with 2 layers, no residual connection
                    return SimpleConcatBlock(module_dim,
                                             kernel_size=module_kernel_size)

                if num_inputs in [0, 1]:
                    return ResidualBlock(module_dim,
                                         kernel_size=module_kernel_size,
                                         with_residual=module_residual,
                                         with_batchnorm=module_batchnorm,
                                         shared_block=shared_block,
                                         post_linear=kl_loss)
                else:
                    return ConcatBlock(module_dim,
                                       kernel_size=module_kernel_size,
                                       with_residual=module_residual,
                                       with_batchnorm=module_batchnorm,
                                       shared_block=shared_block,
                                       post_linear=kl_loss)

            mod = create_module()
            if mod is not None:
                self.add_module(fn_str, mod)
                self.function_modules[fn_str] = mod

        self.save_module_outputs = False
        # NOTE(review): toggled elsewhere; semantics of "noise" not visible
        # from this constructor.
        self.noise_enabled = True

        if learn_control:
            # Learned controller producing module control signals from the
            # question (rnn_dim must be provided in this case).
            self.controller = MACControl(30, rnn_dim, module_dim)
Beispiel #12
0
    def __init__(self,
                 vocab,
                 rnn_wordvec_dim=300,
                 rnn_dim=256,
                 rnn_num_layers=2,
                 rnn_dropout=0,
                 feature_dim=(1024, 14, 14),
                 stem_module_dim=128,
                 stem_use_resnet=False,
                 stem_resnet_fixed=False,
                 resnet_model_stage=3,
                 stem_num_layers=2,
                 stem_batchnorm=False,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_stride2_freq=0,
                 stem_padding=None,
                 use_coords=0,
                 film=False,
                 stacked_attn_dim=512,
                 num_stacked_attn=2,
                 sa_kernel_size=1,
                 fc_use_batchnorm=False,
                 fc_dropout=0,
                 fc_dims=(1024, )):
        """CNN + LSTM model with stacked attention over image features."""
        super(CnnLstmSaModel, self).__init__()

        # Question encoder.
        self.rnn = LstmEncoder(token_to_idx=vocab['question_token_to_idx'],
                               wordvec_dim=rnn_wordvec_dim,
                               rnn_dim=rnn_dim,
                               rnn_num_layers=rnn_num_layers,
                               rnn_dropout=rnn_dropout)

        # Image stem: raw feature channels -> stem_module_dim channels.
        self.stem = build_stem(stem_use_resnet,
                               stem_resnet_fixed,
                               feature_dim[0],
                               stem_module_dim,
                               resnet_model_stage=resnet_model_stage,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               stride2_freq=stem_stride2_freq,
                               padding=stem_padding)

        # Spatial size after the stem: halved once for every
        # stem_stride2_freq stem layers.
        shrink = 1
        if stem_stride2_freq > 0:
            shrink = 2 ** (stem_num_layers // stem_stride2_freq)
        grid_h = feature_dim[1] // shrink
        grid_w = feature_dim[2] // shrink

        if use_coords == 1:
            self.coords = coord_map((grid_h, grid_w))
        else:
            # Any other value disables coordinate channels.
            use_coords = 0
            self.coords = None

        # Project image and question into the shared attention space; the
        # image projection leaves room for the 2 coordinate channels.
        self.image_proj = nn.Conv2d(stem_module_dim,
                                    stacked_attn_dim - 2 * use_coords,
                                    kernel_size=1,
                                    padding=0)
        self.ques_proj = nn.Linear(rnn_dim, stacked_attn_dim)

        # Attention stack; each layer is registered so its parameters train.
        self.stacked_attns = []
        for i in range(num_stacked_attn):
            attn = StackedAttention(stacked_attn_dim,
                                    stacked_attn_dim,
                                    kernel_size=sa_kernel_size,
                                    film=film)
            self.stacked_attns.append(attn)
            self.add_module('stacked-attn-%d' % i, attn)

        # Final answer head; no spatial dims or downsampling needed since the
        # attention stack already produces a pooled representation.
        self.classifier = build_classifier(
            module_C=stacked_attn_dim,
            module_H=None,
            module_W=None,
            num_answers=len(vocab['answer_token_to_idx']),
            fc_dims=fc_dims,
            proj_dim=None,
            downsample=None,
            with_batchnorm=fc_use_batchnorm,
            dropout=fc_dropout)
        init_modules(self.modules(), init='normal')
Beispiel #13
0
    def __init__(self,
                 vocab,
                 rnn_wordvec_dim=300,
                 rnn_dim=256,
                 rnn_num_layers=2,
                 rnn_dropout=0,
                 feature_dim=(1024, 14, 14),
                 stem_module_dim=128,
                 stem_use_resnet=False,
                 stem_resnet_fixed=False,
                 resnet_model_stage=3,
                 stem_num_layers=2,
                 stem_batchnorm=False,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_stride2_freq=0,
                 stem_padding=None,
                 use_coords=None,
                 film=False,
                 cl_kernel_size=1,
                 cl_early_fusion=False,
                 relational_module=False,
                 rel_image_dim=24,
                 rel_module_dim=256,
                 rel_num_layers=4,
                 multimodal_core=False,
                 mc_module_dim=256,
                 mc_num_layers=4,
                 mc_batchnorm=True,
                 mc_kernel_size=1,
                 fc_dims=(1024, ),
                 fc_use_batchnorm=False,
                 fc_dropout=0):
        """CNN+LSTM baseline with optional FiLM conditioning, fusing image
        and question either via a relational module
        (https://arxiv.org/abs/1706.01427), a multimodal core
        (https://arxiv.org/abs/1809.04482), or plain concatenation.

        use_coords: 1 to append 2 coordinate channels, 0 to disable, or
        None (default) to enable them only when relational_module is set.
        """
        super(CnnLstmModel, self).__init__()

        # Resolve the coordinate-channel flag up front.  BUGFIX: the
        # original code computed rnn_dim below with the raw `use_coords`
        # before its None default was resolved (that happened only after
        # building the stem), so film=True with use_coords=None raised
        # TypeError on `8 * None`.  The resolution rule is unchanged:
        # None means "on for the relational module, off otherwise".
        if use_coords == 1 or (use_coords is None and relational_module):
            use_coords = 1
        else:
            use_coords = 0

        if film:
            # With FiLM the RNN output supplies the per-channel gammas and
            # betas (2 per channel), including the coordinate channels where
            # the fusion module uses them, so rnn_dim is overridden.
            if relational_module:
                rnn_dim = 2 * stem_module_dim + 8 * use_coords
            elif multimodal_core:
                rnn_dim = 2 * stem_module_dim + 4 * use_coords
            else:
                rnn_dim = 2 * stem_module_dim
        rnn_kwargs = {
            'token_to_idx': vocab['question_token_to_idx'],
            'wordvec_dim': rnn_wordvec_dim,
            'rnn_dim': rnn_dim,
            'rnn_num_layers': rnn_num_layers,
            'rnn_dropout': rnn_dropout,
        }
        self.rnn = LstmEncoder(**rnn_kwargs)

        # Image stem: raw feature channels -> stem_module_dim channels.
        self.stem = build_stem(stem_use_resnet,
                               stem_resnet_fixed,
                               feature_dim[0],
                               stem_module_dim,
                               resnet_model_stage=resnet_model_stage,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               stride2_freq=stem_stride2_freq,
                               padding=stem_padding)

        # Spatial size after the stem: halved once for every
        # stem_stride2_freq stem layers.
        if stem_stride2_freq > 0:
            module_H = feature_dim[1] // (2**(stem_num_layers //
                                              stem_stride2_freq))
            module_W = feature_dim[2] // (2**(stem_num_layers //
                                              stem_stride2_freq))
        else:
            module_H = feature_dim[1]
            module_W = feature_dim[2]

        # use_coords was already normalized to 0/1 above.
        self.coords = coord_map((module_H, module_W)) if use_coords else None

        self.film = FiLM() if film else None

        # The relational module and the multimodal core are mutually
        # exclusive fusion mechanisms.
        assert not relational_module or not multimodal_core
        self.relational_module = relational_module
        self.multimodal_core = multimodal_core

        if self.relational_module:
            # https://arxiv.org/abs/1706.01427
            # Reduce channels before forming all pairwise object pairs.
            self.conv = nn.Conv2d(stem_module_dim,
                                  rel_image_dim,
                                  kernel_size=1,
                                  padding=0)
            if film:
                # Question enters via FiLM, not by concatenation.
                self.rel = build_relational_module(
                    feature_dim=((rel_image_dim + 2 * use_coords) * 2),
                    module_dim=rel_module_dim,
                    num_layers=rel_num_layers)
            else:
                # Question vector is concatenated onto each object pair.
                self.rel = build_relational_module(
                    feature_dim=((rel_image_dim + 2 * use_coords) * 2 +
                                 rnn_dim),
                    module_dim=rel_module_dim,
                    num_layers=rel_num_layers)
            module_C = rel_module_dim

        elif self.multimodal_core:
            # https://arxiv.org/abs/1809.04482
            if film:
                self.mc = build_multimodal_core(feature_dim=(stem_module_dim +
                                                             2 * use_coords),
                                                module_dim=mc_module_dim,
                                                num_layers=mc_num_layers,
                                                with_batchnorm=mc_batchnorm,
                                                kernel_size=mc_kernel_size)
            else:
                self.mc = build_multimodal_core(
                    feature_dim=(stem_module_dim + rnn_dim + 2 * use_coords),
                    module_dim=mc_module_dim,
                    num_layers=mc_num_layers,
                    with_batchnorm=mc_batchnorm,
                    kernel_size=mc_kernel_size)
            module_C = mc_module_dim

        else:
            # Plain CNN+LSTM: fuse either before the conv (early fusion) or
            # after pooling (late fusion).  NOTE: self.early_fusion only
            # exists in this configuration, matching the original code.
            self.early_fusion = cl_early_fusion
            if cl_early_fusion and not film:
                self.conv = nn.Conv2d(stem_module_dim + 2 * use_coords +
                                      rnn_dim,
                                      stem_module_dim,
                                      kernel_size=cl_kernel_size,
                                      padding=cl_kernel_size // 2)
                module_C = stem_module_dim
            else:
                self.conv = nn.Conv2d(stem_module_dim + 2 * use_coords,
                                      stem_module_dim,
                                      kernel_size=cl_kernel_size,
                                      padding=cl_kernel_size // 2)
                if cl_early_fusion or film:
                    module_C = stem_module_dim
                else:
                    # Late fusion: question vector is appended after pooling.
                    module_C = stem_module_dim + rnn_dim
            self.pool = nn.MaxPool2d(kernel_size=(module_H, module_W),
                                     stride=(module_H, module_W))

        # Final answer head over the fused representation.
        self.classifier = build_classifier(module_C=module_C,
                                           module_H=None,
                                           module_W=None,
                                           num_answers=len(
                                               vocab['answer_token_to_idx']),
                                           fc_dims=fc_dims,
                                           proj_dim=None,
                                           downsample=None,
                                           with_batchnorm=fc_use_batchnorm,
                                           dropout=fc_dropout)
Beispiel #14
0
    def __init__(self,
                 vocab,
                 feature_dim,
                 use_film,
                 use_simple_block,
                 sharing_patterns,
                 stem_num_layers,
                 stem_batchnorm,
                 stem_subsample_layers,
                 stem_kernel_size,
                 stem_stride,
                 stem_padding,
                 stem_dim,
                 module_dim,
                 module_kernel_size,
                 module_input_proj,
                 module_residual=True,
                 module_batchnorm=False,
                 classifier_proj_dim=512,
                 classifier_downsample='maxpool2',
                 classifier_fc_layers=(1024, ),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 verbose=True):
        """Build a neural module network: a CNN stem, one module per program
        token in ``vocab['program_token_to_idx']`` (plain residual / simple
        blocks, or FiLM-conditioned blocks), and a classifier head over the
        stem's output feature map.

        Args:
            vocab: dict with 'answer_idx_to_token', 'program_token_to_idx'
                and 'program_token_arity' entries.
            feature_dim: (C, H, W) of the input image features.
            use_film: 0/1 — build FiLM blocks instead of plain blocks
                (0 recovers the original EE model).
            use_simple_block: 0/1 — use a simple conv+ReLU block instead of
                a residual block (only consulted when FiLM is off).
            sharing_patterns: two-element list of 0/1 flags, only active
                when ``use_film`` is on (see the comment block below).
            verbose: print the constructed stem and classifier.

        Raises:
            Exception: for program tokens whose arity is neither 1 nor 2
                (other than 'scene').

        NOTE(review): ``stem_stride`` is accepted but never forwarded to
        ``build_stem`` below — confirm whether that is intentional.
        """
        super(ModuleNet, self).__init__()

        self.module_dim = module_dim

        # should be 0 or 1 to indicate the use of film block or not (0 would bring you back to the original EE model)
        self.use_film = use_film
        # should be 0 or 1 to indicate if we are using ResNets or a simple 3x3 conv followed by ReLU
        self.use_simple_block = use_simple_block

        # this should be a list of two elements (either 0 or 1). It's only active if self.use_film == 1
        # The first element of 1 indicates the sharing of CNN weights in the film blocks, 0 otherwise
        # The second element of 1 indicate the sharing of film coefficient in the film blocks, 0 otherwise
        # so [1,0] would be sharing the CNN weights while having different film coefficients for different modules in the program
        self.sharing_patterns = sharing_patterns

        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               subsample_layers=stem_subsample_layers,
                               kernel_size=stem_kernel_size,
                               padding=stem_padding,
                               with_batchnorm=stem_batchnorm)
        # Run a dummy zero tensor through the stem to discover the spatial
        # size (H, W) of the module-level feature maps.
        tmp = self.stem(
            Variable(
                torch.zeros(
                    [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)

        self.coords = coord_map((module_H, module_W))

        if verbose:
            print('Here is my stem:')
            print(self.stem)

        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(module_dim,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)
        if verbose:
            print('Here is my classifier:')
            print(self.classifier)
        # Bookkeeping for optional per-stage timing instrumentation.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        # Map from program token -> module (only tokens that get their own
        # module appear here), its arity, and its FiLM-coefficient id.
        self.function_modules = {}
        self.function_modules_num_inputs = {}
        self.fn_str_2_filmId = {}
        self.vocab = vocab
        for fn_str in vocab['program_token_to_idx']:
            num_inputs = vocab['program_token_arity'][fn_str]
            self.function_modules_num_inputs[fn_str] = num_inputs

            if self.use_film:
                if self.sharing_patterns[1] == 1:
                    # Shared FiLM coefficients: every token maps to id 0.
                    self.fn_str_2_filmId[fn_str] = 0
                else:
                    # Per-token coefficients: assign ids in insertion order.
                    self.fn_str_2_filmId[fn_str] = len(self.fn_str_2_filmId)

            if fn_str == 'scene' or num_inputs == 1:
                if self.use_film:
                    if self.sharing_patterns[0] == 1:
                        # CNN weights are shared: a single 'shared_film'
                        # module is created after this loop instead.
                        mod = None
                    else:
                        mod = FiLMedResBlock(
                            module_dim,
                            with_residual=module_residual,
                            with_intermediate_batchnorm=False,
                            with_batchnorm=False,
                            with_cond=[True, True],
                            num_extra_channels=2,  # was 2 for original film,
                            extra_channel_freq=1,
                            with_input_proj=module_input_proj,
                            num_cond_maps=0,
                            kernel_size=module_kernel_size,
                            batchnorm_affine=False,
                            num_layers=1,
                            condition_method='bn-film',
                            debug_every=float('inf'))
                else:
                    if self.use_simple_block:
                        mod = SimpleVisualBlock(module_dim,
                                                kernel_size=module_kernel_size)
                    else:
                        mod = ResidualBlock(module_dim,
                                            kernel_size=module_kernel_size,
                                            with_residual=module_residual,
                                            with_batchnorm=module_batchnorm)
            elif num_inputs == 2:
                if self.use_film:
                    if self.sharing_patterns[0] == 1:
                        mod = None
                    else:
                        mod = ConcatFiLMedResBlock(
                            2,
                            module_dim,
                            with_residual=module_residual,
                            with_intermediate_batchnorm=False,
                            with_batchnorm=False,
                            with_cond=[True, True],
                            num_extra_channels=2,  #was 2 for original film,
                            extra_channel_freq=1,
                            with_input_proj=module_input_proj,
                            num_cond_maps=0,
                            kernel_size=module_kernel_size,
                            batchnorm_affine=False,
                            num_layers=1,
                            condition_method='bn-film',
                            debug_every=float('inf'))
                else:
                    mod = ConcatBlock(module_dim,
                                      kernel_size=module_kernel_size,
                                      with_residual=module_residual,
                                      with_batchnorm=module_batchnorm)
            else:
                raise Exception('Not implemented!')

            if mod is not None:
                # Register under the token name so parameters are tracked.
                self.add_module(fn_str, mod)
                self.function_modules[fn_str] = mod

        if self.use_film and self.sharing_patterns[0] == 1:
            # Single FiLM block whose CNN weights are shared by all modules.
            mod = ConcatFiLMedResBlock(
                2,
                module_dim,
                with_residual=module_residual,
                with_intermediate_batchnorm=False,
                with_batchnorm=False,
                with_cond=[True, True],
                num_extra_channels=2,  #was 2 for original film,
                extra_channel_freq=1,
                with_input_proj=module_input_proj,
                num_cond_maps=0,
                kernel_size=module_kernel_size,
                batchnorm_affine=False,
                num_layers=1,
                condition_method='bn-film',
                debug_every=float('inf'))
            self.add_module('shared_film', mod)
            self.function_modules['shared_film'] = mod

        self.declare_film_coefficients()

        self.save_module_outputs = False
    def __init__(
        self,
        vocab,
        feature_dim,
        stem_num_layers,
        stem_batchnorm,
        stem_kernel_size,
        stem_subsample_layers,
        stem_stride,
        stem_padding,
        stem_dim,
        num_modules,
        module_dim,
        question_embedding_dropout,
        stem_dropout,
        memory_dropout,
        read_dropout,
        nonlinearity,
        use_prior_control_in_control_unit,
        use_self_attention,
        use_memory_gate,
        question2output,
        classifier_batchnorm,
        classifier_fc_layers,
        classifier_dropout,
        use_coords,
        write_unit,
        read_connect,
        noisy_controls,
        debug_every=float('inf'),
        print_verbose_every=float('inf'),
        hard_code_control=False,
        verbose=True,
    ):
        """Build a MAC-style network: a CNN stem, ``num_modules`` reasoning
        steps (per-step input units plus shared control/read/write units)
        and an output classifier.

        Args:
            vocab: dict with an 'answer_idx_to_token' entry (classifier size).
            feature_dim: (C, H, W) of the input image features.
            write_unit: one of 'original', 'gru', 'lastread', 'noop'.
            read_connect: one of 'last', 'one', 'two' — how many predecessor
                outputs are projected before the read unit.
            noisy_controls: truthy to add mu/logvar heads so controls are
                parameterized as Gaussians.

        Raises:
            ValueError: if ``write_unit`` is not one of the recognized names.

        NOTE(review): ``stem_dropout`` is accepted but never used in this
        constructor — confirm whether that is intentional.
        """
        super().__init__()

        num_answers = len(vocab['answer_idx_to_token'])

        # Bookkeeping for optional per-stage timing instrumentation.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        self.num_modules = num_modules

        self.question_embedding_dropout = question_embedding_dropout
        self.memory_dropout = memory_dropout
        self.read_dropout = read_dropout

        self.module_dim = module_dim

        self.read_connect = read_connect
        self.question2output = question2output
        self.use_self_attention = use_self_attention == 1
        self.use_memory_gate = use_memory_gate == 1

        self.use_coords_freq = use_coords
        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # Initialize helper variables
        self.stem_use_coords = self.use_coords_freq
        self.extra_channel_freq = self.use_coords_freq

        self.fwd_count = 0
        # Two extra input channels carry the (x, y) coordinate maps.
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
        if self.debug_every <= -1:
            self.print_verbose_every = 1

        # Initialize stem; its input widens by the coordinate channels.
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels
        self.stem = build_stem(stem_feature_dim,
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers,
                               acceptEvenKernel=True)

        # Define units: one input unit per reasoning step, shared
        # control/read/write units across steps.
        self.inputUnits = []
        for i in range(self.num_modules):
            mod = InputUnit(module_dim)
            self.add_module('InputUnit' + str(i + 1), mod)
            self.inputUnits.append(mod)

        self.controlUnit = ControlUnit(
            module_dim,
            use_prior_control_in_control_unit=use_prior_control_in_control_unit
        )
        self.readUnit = ReadUnit(module_dim, nonlinearity, self.read_dropout)

        if write_unit == 'original':
            mod = WriteUnit(module_dim,
                            use_self_attention=self.use_self_attention,
                            use_memory_gate=self.use_memory_gate)
        elif write_unit == 'gru':
            mod = GRUWriteUnit(module_dim)
        elif write_unit == 'lastread':
            mod = LastReadWriteUnit()
        elif write_unit == 'noop':
            mod = NoOpWriteUnit()
        else:
            # Bug fix: `mod` is unbound on this branch, so the original
            # `raise ValueError(mod)` raised NameError instead of the
            # intended ValueError naming the bad option.
            raise ValueError('unknown write_unit: {}'.format(write_unit))
        self.add_module('WriteUnit', mod)
        self.writeUnit = mod

        # Parameter for the initial memory vector.
        self.init_memory = nn.Parameter(torch.randn(module_dim).to(device))

        # First transformation of question embeddings.
        self.init_question_transformer = nn.Linear(self.module_dim,
                                                   self.module_dim)
        self.init_question_non_linear = nn.Tanh()

        self.vocab = vocab

        self.question_embedding_dropout_module = nn.Dropout(
            p=self.question_embedding_dropout)

        # Initialize output classifier
        self.classifier = OutputUnit(module_dim,
                                     classifier_fc_layers,
                                     num_answers,
                                     with_batchnorm=classifier_batchnorm,
                                     dropout=classifier_dropout,
                                     nonlinearity=nonlinearity,
                                     question2output=question2output)

        # Projections applied to predecessor outputs before the read unit;
        # 'last' wires the previous module in directly (no projection).
        self.pre_connects = []
        self.part_transforms = []
        num_pre_connect = {'last': 0, 'one': 1, 'two': 2}
        for i in range(num_pre_connect[read_connect]):
            mod = nn.Linear(self.module_dim, self.module_dim)
            self.pre_connects.append(mod)
            self.add_module("pre_connect_" + str(i), mod)
        if read_connect == 'two':
            # if we connect a module to two modules, we should transform their
            # contributions differently to preserve the information about
            # what comes from the left and what comes from the right
            for i in range(2):
                mod = nn.Linear(self.module_dim, self.module_dim)
                self.part_transforms.append(mod)
                self.add_module("part_transform_" + str(i), mod)

        self.hard_code_control = hard_code_control
        self.noisy_controls = noisy_controls
        if noisy_controls:
            # Controls become Gaussian: predict a mean and a log-variance.
            self.compute_mu = nn.Linear(self.module_dim, self.module_dim)
            self.compute_logvar = nn.Linear(self.module_dim, self.module_dim)

        init_modules(self.modules())
Beispiel #16
0
    def __init__(
            self,
            vocab,
            feature_dim=(1024, 14, 14),
            stem_num_layers=2,
            stem_batchnorm=False,
            stem_kernel_size=3,
            stem_stride=1,
            stem_padding=None,
            num_modules=4,
            max_program_module_arity=2,
            max_program_tree_depth=5,
            module_num_layers=1,
            module_dim=128,
            module_residual=True,
            module_batchnorm=False,
            module_batchnorm_affine=False,
            module_dropout=0,
            module_input_proj=1,
            module_kernel_size=3,
            classifier_proj_dim=512,
            classifier_downsample='maxpool2',
            classifier_fc_layers=(1024, ),
            classifier_batchnorm=False,
            classifier_dropout=0,
            condition_method='bn-film',
            condition_pattern=None,
            use_gamma=True,
            use_beta=True,
            use_coords=1,
            debug_every=float('inf'),
            print_verbose_every=float('inf'),
            verbose=True,
    ):
        """Build a tree-structured FiLMed network: a CNN stem, one module per
        (tree depth, arity) pair plus a fallback module '0', and a classifier
        over the final feature map.

        Args:
            vocab: dict with an 'answer_idx_to_token' entry (classifier size).
            feature_dim: (C, H, W) of the input image features.
            condition_pattern: optional nested structure indexed as
                ``[depth][arity]`` of conditioning flags (any truthy value is
                normalized to bool). ``None``/empty derives a default pattern
                from ``condition_method``.
        """
        super(TFiLMedNet, self).__init__()

        num_answers = len(vocab['answer_idx_to_token'])

        # Bookkeeping for optional per-stage timing instrumentation.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        self.num_modules = num_modules

        self.max_program_module_arity = max_program_module_arity
        self.max_program_tree_depth = max_program_tree_depth

        self.module_num_layers = module_num_layers
        self.module_batchnorm = module_batchnorm
        self.module_dim = module_dim
        self.condition_method = condition_method
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.use_coords_freq = use_coords
        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # Initialize helper variables
        self.stem_use_coords = (stem_stride
                                == 1) and (self.use_coords_freq > 0)
        # Bug fix: the default was a mutable list literal (`[]`), which is
        # shared across calls; use None as the sentinel instead.
        if condition_pattern is None:
            condition_pattern = []
        if len(condition_pattern) == 0:
            # Default pattern: condition every module layer unless the
            # conditioning is done by concatenation.
            self.condition_pattern = []
            for i in range(self.max_program_tree_depth):
                idepth = []
                for j in range(self.max_program_module_arity):
                    ijarity = [[self.condition_method != 'concat'] * 2
                               ] * self.module_num_layers
                    idepth.append(ijarity)
                self.condition_pattern.append(idepth)
        else:
            # Normalize the flags to booleans. Build a fresh nested list
            # rather than mutating the caller's argument in place.
            self.condition_pattern = [[[k > 0
                                        for k in condition_pattern[i][j]]
                                       for j in
                                       range(self.max_program_module_arity)]
                                      for i in
                                      range(self.max_program_tree_depth)]
        self.extra_channel_freq = self.use_coords_freq
        self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
        self.fwd_count = 0
        # Two extra input channels carry the (x, y) coordinate maps.
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
        if self.debug_every <= -1:
            self.print_verbose_every = 1
        # Rough spatial size after the stem; works for the main cases where
        # every stem layer uses the same stride.
        module_H = feature_dim[1] // (stem_stride**stem_num_layers)
        module_W = feature_dim[2] // (stem_stride**stem_num_layers)
        self.coords = coord_map((module_H, module_W))
        # NOTE(review): these hard-code CUDA tensors and will fail on
        # CPU-only builds — confirm CUDA is always available here.
        self.default_weight = Variable(torch.ones(1, 1, self.module_dim)).type(
            torch.cuda.FloatTensor)
        self.default_bias = Variable(torch.zeros(1, 1, self.module_dim)).type(
            torch.cuda.FloatTensor)

        # Initialize stem; its input widens by the coordinate channels.
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels
        self.stem = build_stem(stem_feature_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding)

        # Initialize network body: module '0' is a fallback block, then one
        # block per (depth, arity) pair keyed as '<depth>-<arity>'.
        # (The per-(depth, arity) condition_pattern entry is currently not
        # consumed by these plain blocks.)
        self.function_modules = {}
        self.vocab = vocab

        mod = ResidualBlock(module_dim,
                            with_residual=module_residual,
                            with_batchnorm=module_batchnorm)
        self.add_module('0', mod)
        self.function_modules['0'] = mod

        for dep in range(self.max_program_tree_depth):
            for art in range(self.max_program_module_arity):
                if art == 0:
                    # Unary modules: plain residual block.
                    mod = ResidualBlock(module_dim,
                                        with_residual=module_residual,
                                        with_batchnorm=module_batchnorm)
                else:
                    # N-ary modules concatenate (art + 1) inputs.
                    mod = ConcatBlock(art + 1,
                                      module_dim,
                                      with_residual=module_residual,
                                      with_batchnorm=module_batchnorm)
                ikey = str(dep + 1) + '-' + str(art + 1)
                self.add_module(ikey, mod)
                self.function_modules[ikey] = mod

        # Initialize output classifier
        self.classifier = build_classifier(module_dim +
                                           self.num_extra_channels,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)

        init_modules(self.modules())