Example #1
0
 def __init__(self, input_dim, hidden_dim):
     """Project image features and the question into a joint attention space.

     Args:
         input_dim: channel count of the incoming image features, and the
             size of the question vector.
         hidden_dim: size of the joint attention space.
     """
     super(StackedAttention, self).__init__()
     # Image-side projection: 1x1 convolution over the feature map.
     self.Wv = nn.Conv2d(input_dim,
                         hidden_dim,
                         kernel_size=1,
                         padding=0)
     # Question-side projection into the same space.
     self.Wu = nn.Linear(input_dim, hidden_dim)
     # Collapse the joint representation to one attention logit per cell.
     self.Wp = nn.Conv2d(hidden_dim,
                         1,
                         kernel_size=1,
                         padding=0)
     self.hidden_dim = hidden_dim
     # Filled in by the forward pass so attention can be inspected later.
     self.attention_maps = None
     init_modules(self.modules(), init='normal')
Example #2
0
    def __init__(self, hidden_dim, with_batchnorm=True):
        """Build the language-memory block.

        Args:
            hidden_dim: dimensionality of the RNN hidden states this block
                attends over.
            with_batchnorm: stored on the instance; the batch-norm layer
                below is created unconditionally regardless of this flag.
        """
        super(LangMemBlock, self).__init__()
        self.hidden_dim = hidden_dim
        self.with_batchnorm = with_batchnorm

        # Non-affine batch norm applied to the RNN feature maps.
        self.rnn_batch_norm = nn.BatchNorm1d(hidden_dim, affine=False)

        # 'paramed' attention mode: attention scores come from a learned
        # linear scoring layer (as opposed to a parameter-free 'simple' mode).
        self.att_mode = 'paramed'
        self.compute_attention = nn.Linear(hidden_dim, 1)

        init_modules(self.modules())
Example #3
0
  def __init__(self, in_dim, out_dim=None, with_residual=True, with_batchnorm=True,
               with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1,
               with_input_proj=0, num_cond_maps=0, kernel_size=3, batchnorm_affine=False,
               num_layers=1, condition_method='bn-film', debug_every=float('inf')):
    """FiLM-conditioned residual block.

    Builds an (optional) input-projection conv, a main conv, optional batch
    norm and dropout, and a FiLM layer whose insertion point is selected by
    `condition_method`.

    Args:
      in_dim: number of input feature channels.
      out_dim: number of output channels; defaults to `in_dim`.
      with_residual: stored flag; the forward pass adds a residual connection.
      with_batchnorm: insert BatchNorm2d after the main conv.
      with_cond: per-layer conditioning flags; only with_cond[0] is read here
        to gate creation of the FiLM layer. (The mutable default is never
        mutated in this block, so it is safe.)
      dropout: Dropout2d probability; 0 disables dropout.
      num_extra_channels: extra (e.g. coordinate) channels appended upstream.
      extra_channel_freq: controls where the extra channels are expected
        (>= 1: at the input projection; >= 2: also at the main conv);
        forced to 0 when there are no extra channels.
      with_input_proj: kernel size of the input-projection conv; 0 disables
        the projection. Must be odd when non-zero so `k // 2` padding
        preserves spatial size.
      num_cond_maps: conditioning maps concatenated to the main conv input
        (used by the 'concat' condition method).
      kernel_size: main conv kernel size; must be odd (same-padding).
      batchnorm_affine: learn affine batch-norm parameters even when FiLM
        conditioning is active.
      num_layers: only 1 is supported.
      condition_method: where FiLM is applied ('block-input-film',
        'conv-film', 'bn-film', 'relu-film', 'block-output-film', or
        'concat' which uses num_cond_maps instead).
      debug_every: debug-print period, consumed by the forward pass.

    Raises:
      NotImplementedError: for an even (enabled) input-projection kernel,
        an even main kernel, or num_layers >= 2.
    """
    if out_dim is None:
      out_dim = in_dim
    super(FiLMedResBlock, self).__init__()
    self.with_residual = with_residual
    self.with_batchnorm = with_batchnorm
    self.with_cond = with_cond
    self.dropout = dropout
    self.extra_channel_freq = 0 if num_extra_channels == 0 else extra_channel_freq
    self.with_input_proj = with_input_proj  # Kernel size of input projection
    self.num_cond_maps = num_cond_maps
    self.kernel_size = kernel_size
    self.batchnorm_affine = batchnorm_affine
    self.num_layers = num_layers
    self.condition_method = condition_method
    self.debug_every = debug_every

    # Same-padding below relies on odd kernels (padding = kernel // 2).
    # Bug fix: with_input_proj == 0 means "projection disabled" (see the
    # truthiness check further down), so only validate parity when it is
    # enabled; previously the default value 0 tripped this check.
    if self.with_input_proj and self.with_input_proj % 2 == 0:
      raise NotImplementedError('with_input_proj must be odd (or 0 to disable)')
    if self.kernel_size % 2 == 0:
      raise NotImplementedError('kernel_size must be odd for same-padding')
    if self.num_layers >= 2:
      raise NotImplementedError('only num_layers == 1 is supported')

    if self.condition_method == 'block-input-film' and self.with_cond[0]:
      self.film = FiLM()
    if self.with_input_proj:
      # The projection sees coordinate channels when extra_channel_freq >= 1.
      self.input_proj = nn.Conv2d(in_dim + (num_extra_channels if self.extra_channel_freq >= 1 else 0),
                                  in_dim, kernel_size=self.with_input_proj, padding=self.with_input_proj // 2)

    # Main convolution; its input may carry conditioning maps (for the
    # 'concat' method) and, when extra_channel_freq >= 2, the coordinate
    # channels again.
    self.conv1 = nn.Conv2d(in_dim + self.num_cond_maps +
                           (num_extra_channels if self.extra_channel_freq >= 2 else 0),
                            out_dim, kernel_size=self.kernel_size,
                            padding=self.kernel_size // 2)
    if self.condition_method == 'conv-film' and self.with_cond[0]:
      self.film = FiLM()
    if self.with_batchnorm:
      # When FiLM supplies the scale/shift, batch norm is non-affine unless
      # explicitly overridden via batchnorm_affine.
      self.bn1 = nn.BatchNorm2d(out_dim, affine=((not self.with_cond[0]) or self.batchnorm_affine))
    if self.condition_method == 'bn-film' and self.with_cond[0]:
      self.film = FiLM()
    if dropout > 0:
      self.drop = nn.Dropout2d(p=self.dropout)
    if ((self.condition_method == 'relu-film' or self.condition_method == 'block-output-film')
         and self.with_cond[0]):
      self.film = FiLM()

    init_modules(self.modules())
Example #4
0
 def __init__(self, input_dim, hidden_dim, kernel_size=1, film=False):
     """Stacked-attention unit, optionally FiLM-modulated.

     Args:
         input_dim: channel count of the image features, and the size of
             the question vector.
         hidden_dim: size of the joint attention space.
         kernel_size: kernel of the final attention-logit convolution.
         film: when True, the question projection emits 2 * hidden_dim
             values (consumed as gamma/beta by a FiLM layer).
     """
     super(StackedAttention, self).__init__()
     # Image-side projection: 1x1 convolution over the feature map.
     self.Wv = nn.Conv2d(input_dim, hidden_dim, kernel_size=1, padding=0)
     # Question-side projection; twice as wide when FiLM needs (gamma, beta).
     wu_out = 2 * hidden_dim if film else hidden_dim
     self.Wu = nn.Linear(input_dim, wu_out)
     self.film = FiLM() if film else None
     # Collapse the joint representation to one attention logit per cell.
     self.Wp = nn.Conv2d(hidden_dim,
                         1,
                         kernel_size=kernel_size,
                         padding=kernel_size // 2)
     self.hidden_dim = hidden_dim
     # Filled in by the forward pass so attention can be inspected later.
     self.attention_maps = None
     init_modules(self.modules(), init='normal')
Example #5
0
    def __init__(self,
                 vocab,
                 feature_dim=(3, 64, 64),
                 stem_dim=128,
                 module_dim=128,
                 stem_num_layers=2,
                 stem_batchnorm=True,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_padding=None,
                 stem_feature_dim=24,
                 stem_subsample_layers=None,
                 classifier_fc_layers=(1024, ),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 rnn_hidden_dim=128,
                 **kwargs):
        """Conv stem + MLP classifier over (image features, question vector).

        Args:
            vocab: dict with an 'answer_idx_to_token' mapping sizing the
                answer space.
            feature_dim: (C, H, W) of the input images. Fix: the default is
                now an immutable tuple (was a mutable list default), and the
                stem probe accepts either a list or a tuple from callers.
            stem_dim / module_dim: stem channel widths passed to build_stem.
            stem_*: stem hyper-parameters forwarded to build_stem.
            stem_feature_dim: unused here; kept for config compatibility.
            classifier_*: forwarded to build_classifier.
            rnn_hidden_dim: size of the question vector concatenated to the
                flattened image features.
            **kwargs: ignored extra config entries.
        """
        super().__init__()

        # initialize stem
        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers)
        # Probe the stem with a dummy batch to discover its output size.
        # ([1, *feature_dim] works for list- and tuple-valued feature_dim,
        # unlike the previous [1] + feature_dim which required a list.)
        tmp = self.stem(Variable(torch.zeros([1, *feature_dim])))
        # Renamed from F/H/W: a bare local `F` shadows the conventional
        # torch.nn.functional alias and invites confusion.
        _, stem_c, stem_h, stem_w = tmp.size()

        # initialize classifier
        # TODO(mnoukhov): fix this for >1 layer RNN
        question_dim = rnn_hidden_dim
        image_dim = stem_c * stem_h * stem_w
        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(image_dim + question_dim, 1, 1,
                                           num_answers, classifier_fc_layers,
                                           None, None, classifier_batchnorm,
                                           classifier_dropout)

        init_modules(self.modules())
Example #6
0
    def __init__(self,
                 vocab,
                 rnn_wordvec_dim=300,
                 rnn_dim=256,
                 rnn_num_layers=2,
                 rnn_dropout=0,
                 cnn_feat_dim=(1024, 14, 14),
                 stacked_attn_dim=512,
                 num_stacked_attn=2,
                 fc_use_batchnorm=False,
                 fc_dropout=0,
                 fc_dims=(1024, )):
        """CNN + LSTM model with stacked attention layers.

        Args:
            vocab: dict with 'question_token_to_idx' and
                'answer_token_to_idx' mappings.
            rnn_*: question-encoder hyper-parameters (LstmEncoder).
            cnn_feat_dim: (C, H, W) of the precomputed CNN feature maps.
            stacked_attn_dim: hidden size of each StackedAttention layer.
            num_stacked_attn: how many attention layers to stack.
            fc_*: classifier MLP hyper-parameters (build_mlp).
        """
        super(CnnLstmSaModel, self).__init__()
        # Question encoder.
        self.rnn = LstmEncoder(token_to_idx=vocab['question_token_to_idx'],
                               wordvec_dim=rnn_wordvec_dim,
                               rnn_dim=rnn_dim,
                               rnn_num_layers=rnn_num_layers,
                               rnn_dropout=rnn_dropout)

        C, H, W = cnn_feat_dim
        # Bring CNN features into the RNN's dimensionality for attention.
        self.image_proj = nn.Conv2d(C, rnn_dim, kernel_size=1, padding=0)
        # Keep the attention layers in a plain list for ordered iteration in
        # forward(); add_module registers each one so parameters are tracked.
        self.stacked_attns = []
        for i in range(num_stacked_attn):
            sa = StackedAttention(rnn_dim, stacked_attn_dim)
            self.stacked_attns.append(sa)
            self.add_module('stacked-attn-%d' % i, sa)

        # Final MLP mapping the attended feature to answer logits.
        self.classifier = build_mlp(
            input_dim=rnn_dim,
            hidden_dims=fc_dims,
            output_dim=len(vocab['answer_token_to_idx']),
            use_batchnorm=fc_use_batchnorm,
            dropout=fc_dropout)
        init_modules(self.modules(), init='normal')
Example #7
0
    def __init__(
            self,
            vocab,
            feature_dim=(1024, 14, 14),
            stem_num_layers=2,
            stem_batchnorm=False,
            stem_kernel_size=3,
            stem_subsample_layers=None,
            stem_stride=1,
            stem_padding=None,
            stem_dim=64,
            num_modules=4,
            module_num_layers=1,
            module_dim=128,
            module_residual=True,
            module_intermediate_batchnorm=False,
            module_batchnorm=False,
            module_batchnorm_affine=False,
            module_dropout=0,
            module_input_proj=1,
            module_kernel_size=3,
            classifier_proj_dim=512,
            classifier_downsample='maxpool2',
            classifier_fc_layers=(1024, ),
            classifier_batchnorm=False,
            classifier_dropout=0,
            condition_method='bn-film',
            condition_pattern=[],
            use_gamma=True,
            use_beta=True,
            use_coords=1,
            debug_every=float('inf'),
            print_verbose_every=float('inf'),
            verbose=True,
    ):
        """FiLMed network body: conv stem, FiLM-conditioned residual blocks,
        and an output classifier over the answer vocabulary.

        Args:
            vocab: dict with an 'answer_idx_to_token' mapping sizing the
                answer space.
            feature_dim: (C, H, W) of the incoming image feature maps.
            stem_*: forwarded to build_stem.
            num_modules / module_*: number and hyper-parameters of the
                FiLMedResBlock modules built below.
            classifier_*: forwarded to build_classifier.
            condition_method: where FiLM conditioning is applied inside each
                block ('concat' disables FiLM and concatenates maps instead).
            condition_pattern: per-layer conditioning flags; empty means
                "condition every layer unless the method is 'concat'".
                (NOTE: mutable default — safe here because it is reassigned
                to a fresh list before being appended to.)
            use_gamma / use_beta / use_coords: FiLM and coordinate-channel
                toggles consumed by the forward pass.
            debug_every / print_verbose_every / verbose: debug controls.
        """
        super(FiLMedNet, self).__init__()

        num_answers = len(vocab['answer_idx_to_token'])

        # Timing buckets, populated by the forward pass when self.timing is on.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        self.num_modules = num_modules
        self.module_num_layers = module_num_layers
        self.module_batchnorm = module_batchnorm
        self.module_dim = module_dim
        self.condition_method = condition_method
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.use_coords_freq = use_coords
        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # Initialize helper variables
        # Coordinate channels are fed to the stem only when it doesn't stride
        # (striding would misalign the coordinate grid).
        self.stem_use_coords = (stem_stride
                                == 1) and (self.use_coords_freq > 0)
        self.condition_pattern = condition_pattern
        if len(condition_pattern) == 0:
            # Default: condition every (module, layer) pair, except under
            # 'concat' where FiLM conditioning is disabled entirely.
            self.condition_pattern = []
            for i in range(self.module_num_layers * self.num_modules):
                self.condition_pattern.append(
                    self.condition_method != 'concat')
        else:
            # Normalize caller-supplied ints to booleans.
            self.condition_pattern = [i > 0 for i in self.condition_pattern]
        self.extra_channel_freq = self.use_coords_freq
        self.block = FiLMedResBlock
        # 'concat' conditioning concatenates (gamma, beta)-sized feature maps
        # instead of modulating activations.
        self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
        self.fwd_count = 0
        # Two extra channels = the (x, y) coordinate maps.
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
        if self.debug_every <= -1:
            self.print_verbose_every = 1

        # Initialize stem
        # Stem input grows by the coordinate channels when they are used.
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels
        self.stem = build_stem(stem_feature_dim,
                               stem_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers)
        # Probe the stem with a dummy batch to discover its output spatial size.
        tmp = self.stem(
            Variable(
                torch.zeros(
                    [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = tmp.size(2)
        module_W = tmp.size(3)

        # Coordinate grids for the stem input and for the module inputs.
        self.stem_coords = coord_map((feature_dim[1], feature_dim[2]))
        self.coords = coord_map((module_H, module_W))
        # Identity FiLM parameters (gamma=1, beta=0) used when a block is not
        # conditioned. NOTE(review): `device` is a module-level global here.
        self.default_weight = torch.ones(1, 1, self.module_dim).to(device)
        self.default_bias = torch.zeros(1, 1, self.module_dim).to(device)

        # Initialize FiLMed network body
        # Plain dict for ordered access in forward(); add_module registers
        # each block so its parameters are tracked.
        self.function_modules = {}
        self.vocab = vocab
        for fn_num in range(self.num_modules):
            # Slice out this module's per-layer conditioning flags.
            with_cond = self.condition_pattern[self.module_num_layers *
                                               fn_num:self.module_num_layers *
                                               (fn_num + 1)]
            mod = self.block(
                module_dim,
                with_residual=module_residual,
                with_intermediate_batchnorm=module_intermediate_batchnorm,
                with_batchnorm=module_batchnorm,
                with_cond=with_cond,
                dropout=module_dropout,
                num_extra_channels=self.num_extra_channels,
                extra_channel_freq=self.extra_channel_freq,
                with_input_proj=module_input_proj,
                num_cond_maps=self.num_cond_maps,
                kernel_size=module_kernel_size,
                batchnorm_affine=module_batchnorm_affine,
                num_layers=self.module_num_layers,
                condition_method=condition_method,
                debug_every=self.debug_every)
            self.add_module(str(fn_num), mod)
            self.function_modules[fn_num] = mod

        # Initialize output classifier
        # Classifier also sees the coordinate channels appended to the final
        # module output.
        self.classifier = build_classifier(module_dim +
                                           self.num_extra_channels,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)

        init_modules(self.modules())
Example #8
0
    def __init__(
        self,
        null_token=0,
        start_token=1,
        end_token=2,
        encoder_embed=None,
        encoder_vocab_size=100,
        decoder_vocab_size=100,
        wordvec_dim=200,
        hidden_dim=512,
        rnn_num_layers=1,
        rnn_dropout=0,
        output_batchnorm=False,
        bidirectional=False,
        encoder_type='gru',
        decoder_type='linear',
        gamma_option='linear',
        gamma_baseline=1,
        num_modules=4,
        module_num_layers=1,
        module_dim=128,
        parameter_efficient=False,
        debug_every=float('inf'),
        taking_context=False,
        variational_embedding_dropout=0.,
        embedding_uniform_boundary=0.,
        use_attention=False,
    ):
        """FiLM-parameter generator: encodes the question with an RNN and
        decodes per-module FiLM coefficients (gamma, beta), optionally with
        attention over the full encoder context.

        Args:
            null_token / start_token / end_token: special vocabulary indices.
            encoder_embed: unused here beyond the signature; the embedding is
                built below from encoder_vocab_size and wordvec_dim.
            encoder_vocab_size / decoder_vocab_size: vocabulary sizes.
            wordvec_dim: word-embedding size fed to the encoder RNN.
            hidden_dim: RNN hidden size (halved per direction when
                bidirectional).
            rnn_num_layers / rnn_dropout: RNN depth and dropout.
            output_batchnorm: batch-normalize the emitted FiLM parameters.
            bidirectional: bidirectional encoding; forced on by
                taking_context.
            encoder_type / decoder_type: RNN flavors for init_rnn.
            gamma_option / gamma_baseline: how gammas are post-processed
                (consumed by the forward pass via self.func_list).
            num_modules / module_num_layers / module_dim: geometry of the
                FiLMed network being conditioned.
            parameter_efficient: False selects the legacy (larger)
                cond_feat_size used by older trained models.
            taking_context: keep the full encoder output sequence instead of
                decoding from the last state only.
            variational_embedding_dropout: stored for the forward pass.
            embedding_uniform_boundary: if > 0, re-initialize the embedding
                uniformly in [-b, b] after init_modules.
            use_attention: attend over encoder states per module; implies
                taking_context (and hence bidirectional).
        """
        super(FiLMGen, self).__init__()

        self.use_attention = use_attention

        self.taking_context = taking_context
        if self.use_attention:
            #if we want to use attention, the full context should be computed
            self.taking_context = True
        if self.taking_context:
            #if we want to use the full context, it makes sense to use bidirectional modeling.
            bidirectional = True

        self.encoder_type = encoder_type
        self.decoder_type = decoder_type
        self.output_batchnorm = output_batchnorm
        self.bidirectional = bidirectional
        self.num_dir = 2 if self.bidirectional else 1
        self.gamma_option = gamma_option
        self.gamma_baseline = gamma_baseline
        self.num_modules = num_modules
        self.module_num_layers = module_num_layers
        self.module_dim = module_dim
        self.debug_every = debug_every
        self.NULL = null_token
        self.START = start_token
        self.END = end_token

        self.variational_embedding_dropout = variational_embedding_dropout

        if self.bidirectional:  # and not self.taking_context:
            if decoder_type != 'linear':
                raise (NotImplementedError)
            # Split the hidden size across the two directions so the
            # concatenated state keeps the requested total width.
            hidden_dim = (int)(hidden_dim / self.num_dir)

        # Candidate post-processing functions for the generated coefficients,
        # selected by gamma_option in the forward pass.
        # NOTE(review): F.sigmoid / F.tanh are deprecated in newer PyTorch in
        # favor of torch.sigmoid / torch.tanh — confirm target torch version.
        self.func_list = {
            'linear': None,
            'sigmoid': F.sigmoid,
            'tanh': F.tanh,
            'exp': torch.exp,
        }

        self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
        if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
            self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules

        self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
        self.encoder_rnn = init_rnn(self.encoder_type,
                                    wordvec_dim,
                                    hidden_dim,
                                    rnn_num_layers,
                                    dropout=rnn_dropout,
                                    bidirectional=self.bidirectional)
        self.decoder_rnn = init_rnn(self.decoder_type,
                                    hidden_dim,
                                    hidden_dim,
                                    rnn_num_layers,
                                    dropout=rnn_dropout,
                                    bidirectional=self.bidirectional)

        if self.taking_context:
            # With full context, no single decoding projection is used;
            # re-initialize the encoder RNN with Xavier weights / zero biases.
            self.decoder_linear = None  #nn.Linear(2 * hidden_dim, hidden_dim)
            for n, p in self.encoder_rnn.named_parameters():
                if n.startswith('weight'): xavier_uniform_(p)
                elif n.startswith('bias'): constant_(p, 0.)
        else:
            # One linear layer emits all modules' FiLM parameters at once.
            self.decoder_linear = nn.Linear(
                hidden_dim * self.num_dir,
                self.num_modules * self.cond_feat_size)

        if self.use_attention:
            # Florian Strub used Tanh here, but let's use identity to make this model
            # closer to the baseline film version
            #Need to change this if we want a different mechanism to compute attention weights
            attention_dim = self.module_dim
            # Projects encoder states to attention keys.
            self.context2key = nn.Linear(hidden_dim * self.num_dir,
                                         self.module_dim)
            # to transform control vector to film coefficients
            # Plain lists keep per-module order; add_module registers the
            # layers so their parameters are tracked.
            self.last_vector2key = []
            self.decoders_att = []
            for i in range(num_modules):
                mod = nn.Linear(hidden_dim * self.num_dir, attention_dim)
                self.add_module("last_vector2key{}".format(i), mod)
                self.last_vector2key.append(mod)
                mod = nn.Linear(hidden_dim * self.num_dir, 2 * self.module_dim)
                self.add_module("decoders_att{}".format(i), mod)
                self.decoders_att.append(mod)

        if self.output_batchnorm:
            self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)

        init_modules(self.modules())
        if embedding_uniform_boundary > 0.:
            # Override the default embedding init with a uniform distribution.
            uniform_(self.encoder_embed.weight,
                     -1. * embedding_uniform_boundary,
                     embedding_uniform_boundary)

        # The attention scores will be saved here if the attention is used.
        self.scores = None
Example #9
0
    def __init__(self,
                 vocab,
                 feature_dim=(3, 64, 64),
                 stem_num_layers=2,
                 stem_batchnorm=True,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_padding=None,
                 stem_dim=24,
                 module_num_layers=1,
                 module_dim=128,
                 classifier_fc_layers=(1024,),
                 classifier_batchnorm=False,
                 classifier_dropout=0,
                 rnn_hidden_dim=128,
                 # unused
                 stem_subsample_layers=[],
                 module_input_proj=None,
                 module_residual=None,
                 module_kernel_size=None,
                 module_batchnorm=None,
                 classifier_proj_dim=None,
                 classifier_downsample=None,
                 debug_every=float('inf'),
                 print_verbose_every=float('inf'),
                 verbose=True):
        """Relation-network style model: a conv stem produces a grid of
        "objects", a relation MLP scores object pairs (with coordinates and
        the question vector), and a classifier maps the result to answers.

        Args:
            vocab: dict with an 'answer_idx_to_token' mapping sizing the
                answer space.
            feature_dim: (C, H, W) of the input images.
            stem_*: forwarded to build_stem.
            module_num_layers / module_dim: depth and width of the relation MLP.
            classifier_*: forwarded to build_classifier.
            rnn_hidden_dim: size of the question vector joined to each
                object pair.
            remaining keyword arguments are unused and kept for config
                compatibility.
        """
        super().__init__()

        # Convolutional stem turning raw images into a grid of "objects".
        self.stem = build_stem(feature_dim[0],
                               stem_dim,
                               stem_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding,
                               subsample_layers=stem_subsample_layers)
        # Probe the stem with a dummy batch to find its spatial output size.
        probe = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
        module_H = probe.size(2)
        module_W = probe.size(3)

        # Coordinates appended to each "object" so relations are
        # position-aware. (torch.meshgrid could replace this after 0.4.1.)
        xs = torch.linspace(-1, 1, steps=module_W)
        ys = torch.linspace(-1, 1, steps=module_H)
        grid_x = xs.unsqueeze(1).repeat(1, module_H)
        grid_y = ys.unsqueeze(0).repeat(module_W, 1)
        flat_coords = torch.stack([grid_x, grid_y], dim=2).view(-1, 2)
        self.coords = Variable(flat_coords.to(device))

        # Relation MLP over an object pair: each object contributes its stem
        # features plus 2 coordinates, and the question vector is appended.
        pair_dim = (stem_dim + 2) * 2 + rnn_hidden_dim
        relation_layers = [nn.Linear(pair_dim, module_dim)]
        relation_layers += [nn.Linear(module_dim, module_dim)
                            for _ in range(module_num_layers - 1)]
        self.relation = nn.Sequential(*relation_layers)

        # Classifier (f_theta) over the aggregated relation features.
        num_answers = len(vocab['answer_idx_to_token'])
        self.classifier = build_classifier(module_dim,
                                           1,
                                           1,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           classifier_batchnorm,
                                           classifier_dropout)

        init_modules(self.modules())
Example #10
0
    def __init__(
            self,
            vocab,
            feature_dim=(1024, 14, 14),
            stem_num_layers=2,
            stem_batchnorm=False,
            stem_kernel_size=3,
            stem_stride=1,
            stem_padding=None,
            num_modules=4,
            module_num_layers=1,
            module_dim=128,
            module_residual=True,
            module_batchnorm=False,
            module_batchnorm_affine=False,
            module_dropout=0,
            module_input_proj=1,
            module_kernel_size=3,
            classifier_proj_dim=512,
            classifier_downsample='maxpool2',
            classifier_fc_layers=(1024, ),
            classifier_batchnorm=False,
            classifier_dropout=0,
            condition_method='bn-film',
            condition_pattern=[],
            use_gamma=True,
            use_beta=True,
            use_coords=1,

            # for Language part:
            null_token=0,
            start_token=1,
            end_token=2,
            encoder_embed=None,
            encoder_vocab_size=100,
            decoder_vocab_size=100,
            wordvec_dim=200,
            hidden_dim=512,
            rnn_num_layers=1,
            rnn_dropout=0,
            rnn_time_step=None,
            output_batchnorm=False,
            bidirectional=False,
            encoder_type='gru',
            decoder_type='linear',
            gamma_option='linear',
            gamma_baseline=1,
            parameter_efficient=False,
            debug_every=float('inf'),
            print_verbose_every=float('inf'),
            verbose=True,
    ):
        """Combined FiLMed network: a language branch (embedding + RNN that
        generates FiLM conditioning) fused with an image branch (conv stem +
        FiLM-conditioned residual blocks + language-memory blocks), followed
        by projections and an answer classifier.

        The image-branch arguments mirror FiLMedNet; the language-branch
        arguments mirror FiLMGen. Several internal switches below
        (use_rnn_stem, cond_rnn_pool, modulewise_cond, cond_cnn_proj) are
        hard-coded rather than exposed as parameters.

        NOTE(review): condition_pattern has a mutable default — safe here
        because it is reassigned to a fresh list before being appended to.
        NOTE(review): default_weight/default_bias use torch.cuda.FloatTensor,
        so this constructor requires CUDA — confirm intended.
        """
        super(FiLMedNet, self).__init__()

        self.vocab = vocab
        num_answers = len(vocab['answer_idx_to_token'])

        # Timing buckets, populated by the forward pass when self.timing is on.
        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False
        # for image part
        self.num_modules = num_modules
        self.module_num_layers = module_num_layers
        self.module_batchnorm = module_batchnorm
        self.module_dim = module_dim  # 128
        self.condition_method = condition_method
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.use_coords_freq = use_coords  # == 1
        self.feature_dim = feature_dim

        # for language part
        self.encoder_type = encoder_type
        self.decoder_type = decoder_type
        self.output_batchnorm = output_batchnorm
        self.bidirectional = bidirectional
        self.rnn_time_step = rnn_time_step
        self.hidden_dim = hidden_dim
        self.num_dir = 2 if self.bidirectional else 1
        self.gamma_option = gamma_option
        self.gamma_baseline = gamma_baseline  # =1
        self.debug_every = debug_every
        self.NULL = null_token
        self.START = start_token
        self.END = end_token

        # NOTE(review): redundant reassignment — debug_every was already set above.
        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # initialize rnn
        if self.bidirectional:  # yes
            if decoder_type != 'linear':
                raise (NotImplementedError)
            # Split the hidden size across directions so the concatenated
            # state keeps the requested total width.
            hidden_dim = (int)(hidden_dim / self.num_dir)

        # Candidate post-processing functions for generated coefficients,
        # selected by gamma_option in the forward pass.
        # NOTE(review): F.sigmoid / F.tanh are deprecated in newer PyTorch in
        # favor of torch.sigmoid / torch.tanh — confirm target torch version.
        self.func_list = {
            'linear': None,
            'sigmoid': F.sigmoid,
            'tanh': F.tanh,
            'exp': torch.exp,
            'relu': F.relu
        }

        self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
        if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
            self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules

        # Question encoder: embedding + RNN.
        self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
        self.encoder_rnn = init_rnn(self.encoder_type,
                                    wordvec_dim,
                                    hidden_dim,
                                    rnn_num_layers,
                                    dropout=rnn_dropout,
                                    bidirectional=self.bidirectional)

        # Initialize stem for rnn
        # Optional second RNN over encoder states; disabled by this hard-coded
        # flag, in which case the layers below are never built.
        self.use_rnn_stem = False
        self.stem_rnn_size = int(256 / 2)
        if self.use_rnn_stem:
            self.stem_rnn = init_rnn(self.encoder_type,
                                     self.hidden_dim,
                                     self.stem_rnn_size,
                                     rnn_num_layers,
                                     dropout=rnn_dropout,
                                     bidirectional=self.bidirectional)
            # Downstream dims follow the stem RNN's output width.
            self.hidden_dim = self.stem_rnn_size * 2
            hidden_dim = self.stem_rnn_size

        # Per-module linear layers mapping the language state to this
        # module's FiLM parameters; dict keeps ordered access, add_module
        # registers parameters.
        self.condition_block = {}
        for fn_num in range(self.num_modules):
            mod = nn.Linear(hidden_dim * self.num_dir,
                            self.cond_feat_size)  # gamma, beta for each block.
            self.condition_block[fn_num] = mod
            self.add_module('condition_block_' + str(fn_num), mod)

        # build sentence conditioning for each module:
        # Hard-coded switches selecting how CNN features condition the
        # language memory (see the cond_cnn_proj branch further down).
        self.condition_rnn = {}
        self.cond_rnn_pool = False
        self.modulewise_cond = False
        self.cond_cnn_proj = True
        self.cond_rnn_pool_size = 3
        self.cond_rnn_flatten = Flatten()

        if self.cond_rnn_pool:
            # Flattened size of the pooled feature map (standard pooling
            # output-size formula with stride == kernel == pool size).
            self.cond_rnn_dim_in = self.module_dim * np.floor(
                (feature_dim[1] - self.cond_rnn_pool_size) /
                self.cond_rnn_pool_size +
                1) * np.floor((feature_dim[1] - self.cond_rnn_pool_size) /
                              self.cond_rnn_pool_size + 1)
            self.cond_rnn_dim_in = int(self.cond_rnn_dim_in)
        else:
            # Unpooled: full flattened feature map.
            self.cond_rnn_dim_in = self.module_dim * feature_dim[
                1] * feature_dim[2]

        self.full_pooling = nn.MaxPool2d(kernel_size=self.cond_rnn_pool_size,
                                         padding=0)

        if self.output_batchnorm:
            self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)

        # Initialize helper variables
        # Coordinate channels only make sense when the stem doesn't stride.
        self.stem_use_coords = (stem_stride
                                == 1) and (self.use_coords_freq > 0)
        self.condition_pattern = condition_pattern
        if len(condition_pattern) == 0:
            # Default: condition every (module, layer) pair, except under
            # 'concat' where FiLM conditioning is disabled entirely.
            self.condition_pattern = []
            for i in range(self.module_num_layers * self.num_modules):
                self.condition_pattern.append(
                    self.condition_method != 'concat')
        else:
            # Normalize caller-supplied ints to booleans.
            self.condition_pattern = [i > 0 for i in self.condition_pattern]
        self.extra_channel_freq = self.use_coords_freq
        self.block = FiLMedResBlock

        self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
        self.fwd_count = 0
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0  # == 2
        if self.debug_every <= -1:
            self.print_verbose_every = 1
        # Spatial size after the stem, computed analytically from the stride
        # (unlike the FiLMedNet variant that probes the stem with a dummy batch).
        module_H = feature_dim[1] // (stem_stride**stem_num_layers)
        module_W = feature_dim[2] // (stem_stride**stem_num_layers)
        self.coords = coord_map(
            (module_H,
             module_W))  # size(2,module_H, module_W) expanded linspace.
        self.default_weight = Parameter(
            torch.ones(1, 1, self.module_dim).type(torch.cuda.FloatTensor),
            requires_grad=False)  # to instead film, not used.
        self.default_bias = Parameter(torch.zeros(1, 1, self.module_dim).type(
            torch.cuda.FloatTensor),
                                      requires_grad=False)  # not used.

        # Initialize stem
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels  # 1024 + 2
        self.stem = build_stem(
            stem_feature_dim,
            module_dim,
            num_layers=stem_num_layers,
            with_batchnorm=stem_batchnorm,
            kernel_size=stem_kernel_size,
            stride=stem_stride,
            padding=stem_padding
        )  # stem_batchnorm == 1, kernel_size=3, stride=1, padding=None
        # stem: 1-layer CNN converting 1026 channels into 128 channels.

        # Initialize FiLMed network body
        for fn_num in range(self.num_modules):
            # Slice out this module's per-layer conditioning flags.
            with_cond = self.condition_pattern[self.module_num_layers *
                                               fn_num:self.module_num_layers *
                                               (fn_num + 1)]
            mod = self.block(
                module_dim,
                with_residual=module_residual,
                with_batchnorm=module_batchnorm,
                with_cond=with_cond,
                dropout=module_dropout,  # 0e-2
                num_extra_channels=self.num_extra_channels,
                extra_channel_freq=self.extra_channel_freq,
                with_input_proj=module_input_proj,
                num_cond_maps=self.num_cond_maps,
                kernel_size=module_kernel_size,
                batchnorm_affine=module_batchnorm_affine,
                num_layers=self.module_num_layers,
                condition_method=condition_method,
                debug_every=self.debug_every)
            self.add_module('block_' + str(fn_num), mod)

        # One language-memory block per FiLMed module.
        for fn_num in range(self.num_modules):
            mem = LangMemBlock(self.hidden_dim)
            self.add_module('lang_mem_' + str(fn_num), mem)

        # proj rnn hidden state to common latent space.
        self.rnn_proj = nn.Linear(hidden_dim * self.num_dir,
                                  classifier_proj_dim)
        # Matching projection for the CNN branch into the same latent space.
        self.cnn_proj = build_cnn_proj(module_dim + self.num_extra_channels,
                                       module_H,
                                       module_W,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)

        # cond_proj_out_dim = self.hidden_dim if self.att_mode == 'simple' else 2 * self.hidden_dim
        cond_proj_out_dim = self.hidden_dim

        if self.cond_cnn_proj:
            # CNN features condition the language memory through a conv
            # projection: either one shared projection or one per module.
            if self.modulewise_cond == False:
                self.cnn_proj_for_cond = build_cnn_proj(
                    module_dim + self.num_extra_channels,
                    module_H,
                    module_W,
                    cond_proj_out_dim,
                    classifier_downsample,
                    with_batchnorm=classifier_batchnorm,
                    dropout=classifier_dropout)
            else:
                for fn_num in range(self.num_modules):
                    mod = build_cnn_proj(module_dim + self.num_extra_channels,
                                         module_H,
                                         module_W,
                                         cond_proj_out_dim,
                                         classifier_downsample,
                                         with_batchnorm=classifier_batchnorm,
                                         dropout=classifier_dropout)
                    self.add_module('cnn_proj_for_cond_' + str(fn_num), mod)
        else:
            # Fallback: a per-module linear map over flattened CNN features.
            for fn_num in range(self.num_modules):
                mod = nn.Linear(self.cond_rnn_dim_in, 2 *
                                self.hidden_dim)  # gamma, beta for bi-direct
                self.condition_rnn[fn_num] = mod
                self.add_module('condition_rnn_' + str(fn_num), mod)

        # Initialize output classifier
        self.classifier = build_classifier(num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)

        init_modules(self.modules())
Example #11
0
    def __init__(self,
                 null_token=0,
                 start_token=1,
                 end_token=2,
                 encoder_embed=None,
                 encoder_vocab_size=100,
                 decoder_vocab_size=100,
                 wordvec_dim=200,
                 hidden_dim=512,
                 rnn_num_layers=1,
                 rnn_dropout=0,
                 output_batchnorm=False,
                 bidirectional=False,
                 encoder_type='gru',
                 decoder_type='linear',
                 gamma_option='linear',
                 gamma_baseline=1,
                 num_modules=4,
                 module_num_layers=1,
                 module_dim=128,
                 parameter_efficient=False,
                 debug_every=float('inf'),
                 use_bert=False):
        """Construct the FiLM parameter generator.

        Encodes a question with an RNN encoder (optionally preceded by BERT,
        projected down to ``wordvec_dim``) and decodes one flat vector of
        FiLM conditioning parameters of size ``cond_feat_size`` per module.

        Args:
            null_token, start_token, end_token: special vocabulary indices.
            encoder_embed: unused here (the embedding is built internally);
                kept for interface compatibility.
            encoder_vocab_size: size of the question vocabulary.
            decoder_vocab_size: unused in this constructor; kept for
                interface compatibility.
            wordvec_dim: word-embedding dimensionality.
            hidden_dim: RNN hidden size (split across the two directions
                when ``bidirectional``).
            rnn_num_layers, rnn_dropout: RNN depth and dropout rate.
            output_batchnorm: if True, apply BatchNorm1d to decoder output.
            bidirectional: use a bidirectional encoder (requires a linear
                decoder).
            encoder_type, decoder_type: cell types passed to ``init_rnn``.
            gamma_option: key into ``self.func_list`` choosing the gamma
                activation ('linear' means identity).
            gamma_baseline: baseline for gammas (consumed downstream).
            num_modules: number of FiLMed blocks to condition.
            module_num_layers, module_dim: geometry of each FiLMed block.
            parameter_efficient: if False, use the larger legacy
                ``cond_feat_size`` (only for loading older trained models).
            debug_every: debug-print period.
            use_bert: if True, load a pretrained BERT encoder.
        """
        super(FiLMGen, self).__init__()
        self.encoder_type = encoder_type
        self.decoder_type = decoder_type
        self.output_batchnorm = output_batchnorm
        self.bidirectional = bidirectional
        self.num_dir = 2 if self.bidirectional else 1
        self.gamma_option = gamma_option
        self.gamma_baseline = gamma_baseline
        self.num_modules = num_modules
        self.module_num_layers = module_num_layers
        self.module_dim = module_dim
        self.debug_every = debug_every
        self.NULL = null_token
        self.START = start_token
        self.END = end_token
        if self.bidirectional:
            if decoder_type != 'linear':
                raise NotImplementedError(
                    'bidirectional FiLMGen supports only a linear decoder')
            # Split the hidden size across the two directions so the
            # concatenated forward/backward state has size hidden_dim.
            hidden_dim = hidden_dim // self.num_dir

        # Candidate gamma activations; 'linear' (None) means identity.
        # torch.sigmoid/torch.tanh replace the deprecated F.sigmoid/F.tanh.
        self.func_list = {
            'linear': None,
            'sigmoid': torch.sigmoid,
            'tanh': torch.tanh,
            'exp': torch.exp,
        }

        # FiLM params per ResBlock: one gamma and one beta per channel/layer.
        self.cond_feat_size = 2 * self.module_dim * self.module_num_layers
        if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
            self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules

        self.use_bert = use_bert
        if use_bert:
            self.bert = BertModel.from_pretrained("bert-base-uncased")
            # Project BERT's 768-dim hidden states down to wordvec_dim.
            self.bert_proj = nn.Linear(768, wordvec_dim)
        self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
        self.encoder_rnn = init_rnn(self.encoder_type,
                                    wordvec_dim,
                                    hidden_dim,
                                    rnn_num_layers,
                                    dropout=rnn_dropout,
                                    bidirectional=self.bidirectional)
        self.decoder_rnn = init_rnn(self.decoder_type,
                                    hidden_dim,
                                    hidden_dim,
                                    rnn_num_layers,
                                    dropout=rnn_dropout,
                                    bidirectional=self.bidirectional)
        # One flat FiLM-parameter vector per module.
        self.decoder_linear = nn.Linear(hidden_dim * self.num_dir,
                                        self.num_modules * self.cond_feat_size)
        if self.output_batchnorm:
            self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)

        init_modules(self.modules())
Example #12
0
    def __init__(self,
                 vocab,
                 rnn_wordvec_dim=300,
                 rnn_dim=256,
                 rnn_num_layers=2,
                 rnn_dropout=0,
                 feature_dim=(1024, 14, 14),
                 stem_module_dim=128,
                 stem_use_resnet=False,
                 stem_resnet_fixed=False,
                 resnet_model_stage=3,
                 stem_num_layers=2,
                 stem_batchnorm=False,
                 stem_kernel_size=3,
                 stem_stride=1,
                 stem_stride2_freq=0,
                 stem_padding=None,
                 use_coords=0,
                 film=False,
                 stacked_attn_dim=512,
                 num_stacked_attn=2,
                 sa_kernel_size=1,
                 fc_use_batchnorm=False,
                 fc_dropout=0,
                 fc_dims=(1024, )):
        """CNN stem + LSTM question encoder with stacked attention layers."""
        super(CnnLstmSaModel, self).__init__()

        # Question encoder.
        self.rnn = LstmEncoder(token_to_idx=vocab['question_token_to_idx'],
                               wordvec_dim=rnn_wordvec_dim,
                               rnn_dim=rnn_dim,
                               rnn_num_layers=rnn_num_layers,
                               rnn_dropout=rnn_dropout)

        # Visual stem over the pre-extracted image feature maps.
        self.stem = build_stem(stem_use_resnet,
                               stem_resnet_fixed,
                               feature_dim[0],
                               stem_module_dim,
                               resnet_model_stage=resnet_model_stage,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               stride2_freq=stem_stride2_freq,
                               padding=stem_padding)

        # Spatial size after the stem: each stride-2 layer halves H and W.
        if stem_stride2_freq > 0:
            shrink = 2 ** (stem_num_layers // stem_stride2_freq)
            module_H = feature_dim[1] // shrink
            module_W = feature_dim[2] // shrink
        else:
            module_H, module_W = feature_dim[1], feature_dim[2]

        # Optional coordinate channels appended to the image features;
        # any value other than 1 disables them.
        if use_coords == 1:
            self.coords = coord_map((module_H, module_W))
        else:
            use_coords = 0
            self.coords = None

        # Project image and question features into the shared attention space
        # (coordinate channels, if any, take up 2 of the projected channels).
        self.image_proj = nn.Conv2d(stem_module_dim,
                                    stacked_attn_dim - 2 * use_coords,
                                    kernel_size=1,
                                    padding=0)
        self.ques_proj = nn.Linear(rnn_dim, stacked_attn_dim)

        # Stack of attention layers, registered so parameters are tracked.
        self.stacked_attns = []
        for idx in range(num_stacked_attn):
            attn = StackedAttention(stacked_attn_dim,
                                    stacked_attn_dim,
                                    kernel_size=sa_kernel_size,
                                    film=film)
            self.stacked_attns.append(attn)
            self.add_module('stacked-attn-%d' % idx, attn)

        # Answer classifier over the attended features.
        self.classifier = build_classifier(module_C=stacked_attn_dim,
                                           module_H=None,
                                           module_W=None,
                                           num_answers=len(
                                               vocab['answer_token_to_idx']),
                                           fc_dims=fc_dims,
                                           proj_dim=None,
                                           downsample=None,
                                           with_batchnorm=fc_use_batchnorm,
                                           dropout=fc_dropout)
        init_modules(self.modules(), init='normal')
Example #13
0
    def __init__(
            self,
            vocab,
            feature_dim=(1024, 14, 14),
            stem_num_layers=2,
            stem_batchnorm=False,
            stem_kernel_size=3,
            stem_stride=1,
            stem_padding=None,
            num_modules=4,
            max_program_module_arity=2,
            max_program_tree_depth=5,
            module_num_layers=1,
            module_dim=128,
            module_residual=True,
            module_batchnorm=False,
            module_batchnorm_affine=False,
            module_dropout=0,
            module_input_proj=1,
            module_kernel_size=3,
            classifier_proj_dim=512,
            classifier_downsample='maxpool2',
            classifier_fc_layers=(1024, ),
            classifier_batchnorm=False,
            classifier_dropout=0,
            condition_method='bn-film',
            condition_pattern=None,
            use_gamma=True,
            use_beta=True,
            use_coords=1,
            debug_every=float('inf'),
            print_verbose_every=float('inf'),
            verbose=True,
    ):
        """Tree-structured FiLMed network.

        Builds a stem over image features, a bank of residual/concat blocks
        indexed by (program tree depth, module arity), and an answer
        classifier.

        Args:
            vocab: dict with at least 'answer_idx_to_token'.
            feature_dim: (C, H, W) of the incoming image feature maps.
            stem_*: stem geometry passed to ``build_stem``.
            num_modules: stored for downstream use.
            max_program_module_arity / max_program_tree_depth: size of the
                (depth x arity) grid of function modules.
            module_*: per-module geometry; several (e.g.
                ``module_batchnorm_affine``, ``module_dropout``,
                ``module_input_proj``, ``module_kernel_size``) are unused in
                this constructor but kept for interface compatibility.
            classifier_*: classifier geometry passed to ``build_classifier``.
            condition_method: FiLM conditioning method; 'concat' switches on
                ``num_cond_maps``.
            condition_pattern: optional nested list
                [depth][arity][layer] of 0/1 flags; ``None`` (default) builds
                a pattern from ``condition_method``. (Default was previously
                a mutable ``[]`` literal.)
            use_gamma, use_beta: enable FiLM gamma/beta (consumed downstream).
            use_coords: coordinate-channel frequency; > 0 adds 2 extra
                channels.
            debug_every, print_verbose_every, verbose: debug controls
                (``verbose`` is unused here; kept for interface
                compatibility).
        """
        super(TFiLMedNet, self).__init__()

        # Avoid the shared-mutable-default pitfall (default was `[]`).
        if condition_pattern is None:
            condition_pattern = []

        num_answers = len(vocab['answer_idx_to_token'])

        self.stem_times = []
        self.module_times = []
        self.classifier_times = []
        self.timing = False

        self.num_modules = num_modules

        self.max_program_module_arity = max_program_module_arity
        self.max_program_tree_depth = max_program_tree_depth

        self.module_num_layers = module_num_layers
        self.module_batchnorm = module_batchnorm
        self.module_dim = module_dim
        self.condition_method = condition_method
        self.use_gamma = use_gamma
        self.use_beta = use_beta
        self.use_coords_freq = use_coords
        self.debug_every = debug_every
        self.print_verbose_every = print_verbose_every

        # Initialize helper variables
        self.stem_use_coords = (stem_stride
                                == 1) and (self.use_coords_freq > 0)
        if len(condition_pattern) == 0:
            # Default: condition every layer of every module unless the
            # conditioning is done by concatenation.
            self.condition_pattern = []
            for i in range(self.max_program_tree_depth):
                idepth = []
                for j in range(self.max_program_module_arity):
                    ijarity = [[self.condition_method != 'concat'] * 2
                               ] * self.module_num_layers
                    idepth.append(ijarity)
                self.condition_pattern.append(idepth)
        else:
            # Normalize the user-supplied flags to booleans WITHOUT mutating
            # the caller's nested lists (the old code wrote through aliases).
            self.condition_pattern = [
                [[k > 0 for k in condition_pattern[i][j]]
                 for j in range(self.max_program_module_arity)]
                for i in range(self.max_program_tree_depth)
            ]
        self.extra_channel_freq = self.use_coords_freq
        # 'concat' conditioning feeds gammas/betas as extra feature maps.
        self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
        self.fwd_count = 0
        self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
        if self.debug_every <= -1:
            self.print_verbose_every = 1
        # Rough spatial size after the stem: works for the main cases.
        module_H = feature_dim[1] // (stem_stride**stem_num_layers)
        module_W = feature_dim[2] // (stem_stride**stem_num_layers)
        self.coords = coord_map((module_H, module_W))
        # Identity FiLM parameters used when a module gets no conditioning.
        # Fall back to CPU tensors when CUDA is absent so construction does
        # not crash on CPU-only machines (was unconditionally cuda).
        if torch.cuda.is_available():
            tensor_type = torch.cuda.FloatTensor
        else:
            tensor_type = torch.FloatTensor
        self.default_weight = Variable(torch.ones(
            1, 1, self.module_dim)).type(tensor_type)
        self.default_bias = Variable(torch.zeros(
            1, 1, self.module_dim)).type(tensor_type)

        # Initialize stem
        stem_feature_dim = feature_dim[
            0] + self.stem_use_coords * self.num_extra_channels
        self.stem = build_stem(stem_feature_dim,
                               module_dim,
                               num_layers=stem_num_layers,
                               with_batchnorm=stem_batchnorm,
                               kernel_size=stem_kernel_size,
                               stride=stem_stride,
                               padding=stem_padding)

        # Initialize FiLMed network body: one module per (depth, arity) cell,
        # keyed 'depth-arity', plus a root residual block keyed '0'.
        self.function_modules = {}
        self.vocab = vocab

        mod = ResidualBlock(module_dim,
                            with_residual=module_residual,
                            with_batchnorm=module_batchnorm)
        self.add_module('0', mod)
        self.function_modules['0'] = mod

        for dep in range(self.max_program_tree_depth):
            for art in range(self.max_program_module_arity):
                with_cond = self.condition_pattern[dep][art]
                if art == 0:
                    # Unary modules are plain residual blocks.
                    mod = ResidualBlock(module_dim,
                                        with_residual=module_residual,
                                        with_batchnorm=module_batchnorm)
                else:
                    # N-ary modules concatenate their (art + 1) inputs.
                    mod = ConcatBlock(art + 1,
                                      module_dim,
                                      with_residual=module_residual,
                                      with_batchnorm=module_batchnorm)
                ikey = str(dep + 1) + '-' + str(art + 1)
                self.add_module(ikey, mod)
                self.function_modules[ikey] = mod

        # Initialize output classifier
        self.classifier = build_classifier(module_dim +
                                           self.num_extra_channels,
                                           module_H,
                                           module_W,
                                           num_answers,
                                           classifier_fc_layers,
                                           classifier_proj_dim,
                                           classifier_downsample,
                                           with_batchnorm=classifier_batchnorm,
                                           dropout=classifier_dropout)

        init_modules(self.modules())