def __init__(self, input_dim, hidden_dim):
    super(StackedAttention, self).__init__()
    self.Wv = nn.Conv2d(input_dim, hidden_dim, kernel_size=1, padding=0)
    self.Wu = nn.Linear(input_dim, hidden_dim)
    self.Wp = nn.Conv2d(hidden_dim, 1, kernel_size=1, padding=0)
    self.hidden_dim = hidden_dim
    self.attention_maps = None
    init_modules(self.modules(), init='normal')
def __init__(self, hidden_dim, with_batchnorm=True):
    super(LangMemBlock, self).__init__()
    self.hidden_dim = hidden_dim
    self.with_batchnorm = with_batchnorm
    # batch norm for rnn feature maps
    self.rnn_batch_norm = nn.BatchNorm1d(self.hidden_dim, affine=False)
    # self.att_mode = 'simple'
    self.att_mode = 'paramed'
    self.compute_attention = nn.Linear(self.hidden_dim, 1)
    init_modules(self.modules())
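# A minimal sketch (not the repo's actual forward pass) of how a block like LangMemBlock
# can use its Linear(hidden_dim, 1) head to score RNN outputs and pool them into a single
# summary vector. The tensor layout and the softmax-over-time choice are assumptions here.
def _langmem_forward_sketch(block, rnn_outputs):
    # rnn_outputs: (N, T, hidden_dim) sequence of RNN hidden states
    scores = block.compute_attention(rnn_outputs).squeeze(-1)   # (N, T) attention logits
    weights = torch.softmax(scores, dim=1)                      # attention over time steps
    summary = (weights.unsqueeze(-1) * rnn_outputs).sum(dim=1)  # (N, hidden_dim)
    return summary, weights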
def __init__(self, in_dim, out_dim=None, with_residual=True, with_batchnorm=True,
             with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1,
             with_input_proj=0, num_cond_maps=0, kernel_size=3, batchnorm_affine=False,
             num_layers=1, condition_method='bn-film', debug_every=float('inf')):
    if out_dim is None:
        out_dim = in_dim
    super(FiLMedResBlock, self).__init__()
    self.with_residual = with_residual
    self.with_batchnorm = with_batchnorm
    self.with_cond = with_cond
    self.dropout = dropout
    self.extra_channel_freq = 0 if num_extra_channels == 0 else extra_channel_freq
    self.with_input_proj = with_input_proj  # Kernel size of input projection
    self.num_cond_maps = num_cond_maps
    self.kernel_size = kernel_size
    self.batchnorm_affine = batchnorm_affine
    self.num_layers = num_layers
    self.condition_method = condition_method
    self.debug_every = debug_every

    if self.with_input_proj % 2 == 0:
        raise NotImplementedError
    if self.kernel_size % 2 == 0:
        raise NotImplementedError
    if self.num_layers >= 2:
        raise NotImplementedError

    if self.condition_method == 'block-input-film' and self.with_cond[0]:
        self.film = FiLM()
    if self.with_input_proj:
        self.input_proj = nn.Conv2d(in_dim + (num_extra_channels if self.extra_channel_freq >= 1 else 0),
                                    in_dim,
                                    kernel_size=self.with_input_proj,
                                    padding=self.with_input_proj // 2)
    self.conv1 = nn.Conv2d(in_dim + self.num_cond_maps + (num_extra_channels if self.extra_channel_freq >= 2 else 0),
                           out_dim,
                           kernel_size=self.kernel_size,
                           padding=self.kernel_size // 2)
    if self.condition_method == 'conv-film' and self.with_cond[0]:
        self.film = FiLM()
    if self.with_batchnorm:
        self.bn1 = nn.BatchNorm2d(out_dim, affine=((not self.with_cond[0]) or self.batchnorm_affine))
    if self.condition_method == 'bn-film' and self.with_cond[0]:
        self.film = FiLM()
    if dropout > 0:
        self.drop = nn.Dropout2d(p=self.dropout)
    if ((self.condition_method == 'relu-film' or self.condition_method == 'block-output-film')
            and self.with_cond[0]):
        self.film = FiLM()
    init_modules(self.modules())
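# The FiLM() module referenced above is defined elsewhere in the codebase. A minimal sketch
# of the standard feature-wise linear modulation it is expected to apply (per the FiLM paper);
# the exact class used by this repo may differ in naming and broadcasting details.
class FiLMSketch(nn.Module):
    def forward(self, x, gammas, betas):
        # x: (N, C, H, W); gammas, betas: (N, C), broadcast over the spatial dimensions
        gammas = gammas.unsqueeze(2).unsqueeze(3).expand_as(x)
        betas = betas.unsqueeze(2).unsqueeze(3).expand_as(x)
        return gammas * x + betas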
def __init__(self, input_dim, hidden_dim, kernel_size=1, film=False):
    super(StackedAttention, self).__init__()
    self.Wv = nn.Conv2d(input_dim, hidden_dim, kernel_size=1, padding=0)
    # self.Wv = nn.Conv2d(hidden_dim, hidden_dim, kernel_size=1, padding=0)
    # self.Wu = nn.Linear(input_dim, hidden_dim)
    if film:
        self.Wu = nn.Linear(input_dim, 2 * hidden_dim)
        self.film = FiLM()
    else:
        self.Wu = nn.Linear(input_dim, hidden_dim)
        self.film = None
    self.Wp = nn.Conv2d(hidden_dim, 1, kernel_size=kernel_size, padding=kernel_size // 2)
    self.hidden_dim = hidden_dim
    self.attention_maps = None
    init_modules(self.modules(), init='normal')
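# A minimal sketch of the forward pass such a stacked-attention module conventionally
# implements (following the Stacked Attention Network formulation). The actual forward is
# not part of this section, so the shapes and the residual update `u + v_tilde` are
# assumptions; this sketch covers only the film=False path, where Wu outputs hidden_dim.
def _stacked_attention_forward_sketch(sa, v, u):
    # v: image features (N, D, H, W); u: question vector (N, D)
    N, K = v.size(0), sa.hidden_dim
    D, H, W = v.size(1), v.size(2), v.size(3)
    v_proj = sa.Wv(v)                                            # (N, K, H, W)
    u_proj = sa.Wu(u).view(N, K, 1, 1).expand(N, K, H, W)        # broadcast question over space
    h = torch.tanh(v_proj + u_proj)
    p = torch.softmax(sa.Wp(h).view(N, H * W), dim=1).view(N, 1, H, W)
    sa.attention_maps = p.data.clone()                           # keep maps for visualization
    v_tilde = (p.expand_as(v) * v).sum(2).sum(2).view(N, D)      # attention-weighted image summary
    return u + v_tilde                                           # refined question vector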
def __init__(self, vocab, feature_dim=[3, 64, 64], stem_dim=128, module_dim=128,
             stem_num_layers=2, stem_batchnorm=True, stem_kernel_size=3,
             stem_stride=1, stem_padding=None, stem_feature_dim=24,
             stem_subsample_layers=None,
             classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0,
             rnn_hidden_dim=128, **kwargs):
    super().__init__()

    # initialize stem
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           padding=stem_padding,
                           subsample_layers=stem_subsample_layers)
    tmp = self.stem(Variable(torch.zeros([1] + feature_dim)))
    # NOTE: `F` here shadows the usual torch.nn.functional alias inside this method.
    _, F, H, W = tmp.size()

    # initialize classifier
    # TODO(mnoukhov): fix this for >1 layer RNN
    question_dim = rnn_hidden_dim
    image_dim = F * H * W
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(image_dim + question_dim, 1, 1, num_answers,
                                       classifier_fc_layers, None, None,
                                       classifier_batchnorm, classifier_dropout)
    init_modules(self.modules())
def __init__(self, vocab, rnn_wordvec_dim=300, rnn_dim=256, rnn_num_layers=2, rnn_dropout=0,
             cnn_feat_dim=(1024, 14, 14), stacked_attn_dim=512, num_stacked_attn=2,
             fc_use_batchnorm=False, fc_dropout=0, fc_dims=(1024, )):
    super(CnnLstmSaModel, self).__init__()
    rnn_kwargs = {
        'token_to_idx': vocab['question_token_to_idx'],
        'wordvec_dim': rnn_wordvec_dim,
        'rnn_dim': rnn_dim,
        'rnn_num_layers': rnn_num_layers,
        'rnn_dropout': rnn_dropout,
    }
    self.rnn = LstmEncoder(**rnn_kwargs)

    C, H, W = cnn_feat_dim
    self.image_proj = nn.Conv2d(C, rnn_dim, kernel_size=1, padding=0)
    self.stacked_attns = []
    for i in range(num_stacked_attn):
        sa = StackedAttention(rnn_dim, stacked_attn_dim)
        self.stacked_attns.append(sa)
        self.add_module('stacked-attn-%d' % i, sa)

    classifier_args = {
        'input_dim': rnn_dim,
        'hidden_dims': fc_dims,
        'output_dim': len(vocab['answer_token_to_idx']),
        'use_batchnorm': fc_use_batchnorm,
        'dropout': fc_dropout,
    }
    self.classifier = build_mlp(**classifier_args)
    init_modules(self.modules(), init='normal')
def __init__(self, vocab, feature_dim=(1024, 14, 14),
             stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3,
             stem_subsample_layers=None, stem_stride=1, stem_padding=None, stem_dim=64,
             num_modules=4, module_num_layers=1, module_dim=128, module_residual=True,
             module_intermediate_batchnorm=False, module_batchnorm=False,
             module_batchnorm_affine=False, module_dropout=0, module_input_proj=1,
             module_kernel_size=3,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0,
             condition_method='bn-film', condition_pattern=[],
             use_gamma=True, use_beta=True, use_coords=1,
             debug_every=float('inf'), print_verbose_every=float('inf'), verbose=True):
    super(FiLMedNet, self).__init__()

    num_answers = len(vocab['answer_idx_to_token'])

    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.num_modules = num_modules
    self.module_num_layers = module_num_layers
    self.module_batchnorm = module_batchnorm
    self.module_dim = module_dim
    self.condition_method = condition_method
    self.use_gamma = use_gamma
    self.use_beta = use_beta
    self.use_coords_freq = use_coords
    self.debug_every = debug_every
    self.print_verbose_every = print_verbose_every

    # Initialize helper variables
    self.stem_use_coords = (stem_stride == 1) and (self.use_coords_freq > 0)
    self.condition_pattern = condition_pattern
    if len(condition_pattern) == 0:
        self.condition_pattern = []
        for i in range(self.module_num_layers * self.num_modules):
            self.condition_pattern.append(self.condition_method != 'concat')
    else:
        self.condition_pattern = [i > 0 for i in self.condition_pattern]
    self.extra_channel_freq = self.use_coords_freq
    self.block = FiLMedResBlock
    self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
    self.fwd_count = 0
    self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
    if self.debug_every <= -1:
        self.print_verbose_every = 1

    # Initialize stem
    stem_feature_dim = feature_dim[0] + self.stem_use_coords * self.num_extra_channels
    self.stem = build_stem(stem_feature_dim, stem_dim, module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           padding=stem_padding,
                           subsample_layers=stem_subsample_layers)
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)

    self.stem_coords = coord_map((feature_dim[1], feature_dim[2]))
    self.coords = coord_map((module_H, module_W))
    self.default_weight = torch.ones(1, 1, self.module_dim).to(device)
    self.default_bias = torch.zeros(1, 1, self.module_dim).to(device)

    # Initialize FiLMed network body
    self.function_modules = {}
    self.vocab = vocab
    for fn_num in range(self.num_modules):
        with_cond = self.condition_pattern[
            self.module_num_layers * fn_num:self.module_num_layers * (fn_num + 1)]
        mod = self.block(module_dim,
                         with_residual=module_residual,
                         with_intermediate_batchnorm=module_intermediate_batchnorm,
                         with_batchnorm=module_batchnorm,
                         with_cond=with_cond,
                         dropout=module_dropout,
                         num_extra_channels=self.num_extra_channels,
                         extra_channel_freq=self.extra_channel_freq,
                         with_input_proj=module_input_proj,
                         num_cond_maps=self.num_cond_maps,
                         kernel_size=module_kernel_size,
                         batchnorm_affine=module_batchnorm_affine,
                         num_layers=self.module_num_layers,
                         condition_method=condition_method,
                         debug_every=self.debug_every)
        self.add_module(str(fn_num), mod)
        self.function_modules[fn_num] = mod

    # Initialize output classifier
    self.classifier = build_classifier(module_dim + self.num_extra_channels, module_H, module_W,
                                       num_answers, classifier_fc_layers, classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)

    init_modules(self.modules())
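# `coord_map` is defined elsewhere in the codebase. A minimal sketch of what it is assumed
# to return: a (2, H, W) grid of x/y coordinates in [-1, 1], which is what gets concatenated
# to the feature maps as the `num_extra_channels` coordinate channels. The signature and
# the exact axis conventions of the real helper may differ.
def _coord_map_sketch(shape, start=-1, end=1):
    h, w = shape
    x_row = torch.linspace(start, end, steps=w)
    y_col = torch.linspace(start, end, steps=h)
    x_coords = x_row.unsqueeze(0).expand(h, w).unsqueeze(0)  # (1, H, W), varies along width
    y_coords = y_col.unsqueeze(1).expand(h, w).unsqueeze(0)  # (1, H, W), varies along height
    return torch.cat([x_coords, y_coords], dim=0)            # (2, H, W)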
def __init__(self, null_token=0, start_token=1, end_token=2,
             encoder_embed=None, encoder_vocab_size=100, decoder_vocab_size=100,
             wordvec_dim=200, hidden_dim=512, rnn_num_layers=1, rnn_dropout=0,
             output_batchnorm=False, bidirectional=False,
             encoder_type='gru', decoder_type='linear',
             gamma_option='linear', gamma_baseline=1,
             num_modules=4, module_num_layers=1, module_dim=128,
             parameter_efficient=False, debug_every=float('inf'),
             taking_context=False, variational_embedding_dropout=0.,
             embedding_uniform_boundary=0., use_attention=False):
    super(FiLMGen, self).__init__()
    self.use_attention = use_attention
    self.taking_context = taking_context
    if self.use_attention:
        # If we want to use attention, the full context should be computed.
        self.taking_context = True
    if self.taking_context:
        # If we want to use the full context, it makes sense to use bidirectional modeling.
        bidirectional = True
    self.encoder_type = encoder_type
    self.decoder_type = decoder_type
    self.output_batchnorm = output_batchnorm
    self.bidirectional = bidirectional
    self.num_dir = 2 if self.bidirectional else 1
    self.gamma_option = gamma_option
    self.gamma_baseline = gamma_baseline
    self.num_modules = num_modules
    self.module_num_layers = module_num_layers
    self.module_dim = module_dim
    self.debug_every = debug_every
    self.NULL = null_token
    self.START = start_token
    self.END = end_token
    self.variational_embedding_dropout = variational_embedding_dropout

    if self.bidirectional:  # and not self.taking_context:
        if decoder_type != 'linear':
            raise NotImplementedError
        hidden_dim = int(hidden_dim / self.num_dir)

    self.func_list = {
        'linear': None,
        'sigmoid': F.sigmoid,
        'tanh': F.tanh,
        'exp': torch.exp,
    }

    self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
    if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
        self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules

    self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
    self.encoder_rnn = init_rnn(self.encoder_type, wordvec_dim, hidden_dim, rnn_num_layers,
                                dropout=rnn_dropout, bidirectional=self.bidirectional)
    self.decoder_rnn = init_rnn(self.decoder_type, hidden_dim, hidden_dim, rnn_num_layers,
                                dropout=rnn_dropout, bidirectional=self.bidirectional)

    if self.taking_context:
        self.decoder_linear = None  # nn.Linear(2 * hidden_dim, hidden_dim)
        for n, p in self.encoder_rnn.named_parameters():
            if n.startswith('weight'):
                xavier_uniform_(p)
            elif n.startswith('bias'):
                constant_(p, 0.)
    else:
        self.decoder_linear = nn.Linear(hidden_dim * self.num_dir,
                                        self.num_modules * self.cond_feat_size)

    if self.use_attention:
        # Florian Strub used Tanh here, but let's use identity to make this model
        # closer to the baseline FiLM version.
        # Need to change this if we want a different mechanism to compute attention weights.
        attention_dim = self.module_dim
        self.context2key = nn.Linear(hidden_dim * self.num_dir, self.module_dim)
        # To transform the control vector to FiLM coefficients:
        self.last_vector2key = []
        self.decoders_att = []
        for i in range(num_modules):
            mod = nn.Linear(hidden_dim * self.num_dir, attention_dim)
            self.add_module("last_vector2key{}".format(i), mod)
            self.last_vector2key.append(mod)
            mod = nn.Linear(hidden_dim * self.num_dir, 2 * self.module_dim)
            self.add_module("decoders_att{}".format(i), mod)
            self.decoders_att.append(mod)

    if self.output_batchnorm:
        self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)

    init_modules(self.modules())

    if embedding_uniform_boundary > 0.:
        uniform_(self.encoder_embed.weight,
                 -1. * embedding_uniform_boundary, embedding_uniform_boundary)

    # The attention scores will be saved here if attention is used.
    self.scores = None
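# A minimal sketch (an assumption, not the repo's forward) of how the non-attention,
# non-context path of FiLMGen can turn the final encoder state into per-module FiLM
# parameters through `decoder_linear`. The slicing into gamma/beta shown here is only
# illustrative; the real layout of `cond_feat_size` depends on `parameter_efficient`.
def _film_params_sketch(gen, encoder_state):
    # encoder_state: (N, hidden_dim * num_dir) summary of the question
    N = encoder_state.size(0)
    film = gen.decoder_linear(encoder_state)                    # (N, num_modules * cond_feat_size)
    film = film.view(N, gen.num_modules, gen.cond_feat_size)    # one chunk per FiLMed block
    gammas, betas = torch.split(film[:, :, :2 * gen.module_dim], gen.module_dim, dim=-1)
    return gammas, betas                                        # each (N, num_modules, module_dim)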
def __init__(self, vocab, feature_dim=(3, 64, 64),
             stem_num_layers=2, stem_batchnorm=True, stem_kernel_size=3,
             stem_stride=1, stem_padding=None, stem_dim=24,
             module_num_layers=1, module_dim=128,
             classifier_fc_layers=(1024,), classifier_batchnorm=False, classifier_dropout=0,
             rnn_hidden_dim=128,
             # unused
             stem_subsample_layers=[], module_input_proj=None, module_residual=None,
             module_kernel_size=None, module_batchnorm=None,
             classifier_proj_dim=None, classifier_downsample=None,
             debug_every=float('inf'), print_verbose_every=float('inf'), verbose=True):
    super().__init__()

    # initialize stem
    self.stem = build_stem(feature_dim[0], stem_dim, stem_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           padding=stem_padding,
                           subsample_layers=stem_subsample_layers)
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)

    # initialize coordinates to be appended to "objects"
    # can be switched to using torch.meshgrid after 0.4.1
    x = torch.linspace(-1, 1, steps=module_W)
    y = torch.linspace(-1, 1, steps=module_H)
    xv = x.unsqueeze(1).repeat(1, module_H)
    yv = y.unsqueeze(0).repeat(module_W, 1)
    coords = torch.stack([xv, yv], dim=2).view(-1, 2)
    self.coords = Variable(coords.to(device))

    # initialize relation model
    # (output of stem + 2 coordinates) * 2 objects + question vector
    relation_modules = [nn.Linear((stem_dim + 2) * 2 + rnn_hidden_dim, module_dim)]
    for _ in range(module_num_layers - 1):
        relation_modules.append(nn.Linear(module_dim, module_dim))
    self.relation = nn.Sequential(*relation_modules)

    # initialize classifier (f_theta)
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, 1, 1, num_answers,
                                       classifier_fc_layers, classifier_proj_dim,
                                       classifier_downsample, classifier_batchnorm,
                                       classifier_dropout)
    init_modules(self.modules())
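# A minimal sketch (assumed, not the repo's actual forward) of how a Relation-Network style
# model uses `self.coords` and `self.relation`: every stem cell becomes an "object", each
# object gets its (x, y) coordinate appended, all object pairs are concatenated with the
# question vector, passed through g_theta (`self.relation`), and sum-pooled before f_theta.
def _relation_forward_sketch(model, feats, q):
    # feats: stem output (N, C, H, W); q: question encoding (N, rnn_hidden_dim)
    N, C, H, W = feats.size()
    objs = feats.view(N, C, H * W).permute(0, 2, 1)                   # (N, H*W, C)
    objs = torch.cat([objs, model.coords.unsqueeze(0).expand(N, -1, -1)], dim=2)  # append coords
    n_obj = objs.size(1)
    o_i = objs.unsqueeze(2).expand(N, n_obj, n_obj, C + 2)
    o_j = objs.unsqueeze(1).expand(N, n_obj, n_obj, C + 2)
    q_exp = q.unsqueeze(1).unsqueeze(2).expand(N, n_obj, n_obj, q.size(1))
    pairs = torch.cat([o_i, o_j, q_exp], dim=3)                        # (N, n_obj, n_obj, 2*(C+2)+q)
    rel = model.relation(pairs.view(N * n_obj * n_obj, -1))            # g_theta over all pairs
    rel = rel.view(N, n_obj * n_obj, -1).sum(dim=1)                    # sum-pool pair features
    return model.classifier(rel.view(N, -1, 1, 1))                     # f_theta / classifier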
def __init__(self, vocab, feature_dim=(1024, 14, 14),
             stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3,
             stem_stride=1, stem_padding=None,
             num_modules=4, module_num_layers=1, module_dim=128, module_residual=True,
             module_batchnorm=False, module_batchnorm_affine=False, module_dropout=0,
             module_input_proj=1, module_kernel_size=3,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0,
             condition_method='bn-film', condition_pattern=[],
             use_gamma=True, use_beta=True, use_coords=1,
             # for the language part:
             null_token=0, start_token=1, end_token=2,
             encoder_embed=None, encoder_vocab_size=100, decoder_vocab_size=100,
             wordvec_dim=200, hidden_dim=512, rnn_num_layers=1, rnn_dropout=0,
             rnn_time_step=None, output_batchnorm=False, bidirectional=False,
             encoder_type='gru', decoder_type='linear',
             gamma_option='linear', gamma_baseline=1, parameter_efficient=False,
             debug_every=float('inf'), print_verbose_every=float('inf'), verbose=True):
    super(FiLMedNet, self).__init__()

    self.vocab = vocab
    num_answers = len(vocab['answer_idx_to_token'])

    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    # for image part
    self.num_modules = num_modules
    self.module_num_layers = module_num_layers
    self.module_batchnorm = module_batchnorm
    self.module_dim = module_dim  # 128
    self.condition_method = condition_method
    self.use_gamma = use_gamma
    self.use_beta = use_beta
    self.use_coords_freq = use_coords  # == 1
    self.feature_dim = feature_dim

    # for language part
    self.encoder_type = encoder_type
    self.decoder_type = decoder_type
    self.output_batchnorm = output_batchnorm
    self.bidirectional = bidirectional
    self.rnn_time_step = rnn_time_step
    self.hidden_dim = hidden_dim
    self.num_dir = 2 if self.bidirectional else 1
    self.gamma_option = gamma_option
    self.gamma_baseline = gamma_baseline  # == 1
    self.debug_every = debug_every
    self.NULL = null_token
    self.START = start_token
    self.END = end_token
    self.print_verbose_every = print_verbose_every

    # initialize rnn
    if self.bidirectional:  # yes
        if decoder_type != 'linear':
            raise NotImplementedError
        hidden_dim = int(hidden_dim / self.num_dir)
    self.func_list = {
        'linear': None,
        'sigmoid': F.sigmoid,
        'tanh': F.tanh,
        'exp': torch.exp,
        'relu': F.relu,
    }
    self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
    if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
        self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules
    self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
    self.encoder_rnn = init_rnn(self.encoder_type, wordvec_dim, hidden_dim, rnn_num_layers,
                                dropout=rnn_dropout, bidirectional=self.bidirectional)

    # Initialize stem for rnn
    self.use_rnn_stem = False
    self.stem_rnn_size = int(256 / 2)
    if self.use_rnn_stem:
        self.stem_rnn = init_rnn(self.encoder_type, self.hidden_dim, self.stem_rnn_size,
                                 rnn_num_layers, dropout=rnn_dropout,
                                 bidirectional=self.bidirectional)
        self.hidden_dim = self.stem_rnn_size * 2
        hidden_dim = self.stem_rnn_size

    self.condition_block = {}
    for fn_num in range(self.num_modules):
        # gamma, beta for each block
        mod = nn.Linear(hidden_dim * self.num_dir, self.cond_feat_size)
        self.condition_block[fn_num] = mod
        self.add_module('condition_block_' + str(fn_num), mod)

    # build sentence conditioning for each module:
    self.condition_rnn = {}
    self.cond_rnn_pool = False
    self.modulewise_cond = False
    self.cond_cnn_proj = True
    self.cond_rnn_pool_size = 3
    self.cond_rnn_flatten = Flatten()
    if self.cond_rnn_pool:
        self.cond_rnn_dim_in = self.module_dim * np.floor(
            (feature_dim[1] - self.cond_rnn_pool_size) / self.cond_rnn_pool_size + 1) * np.floor(
            (feature_dim[1] - self.cond_rnn_pool_size) / self.cond_rnn_pool_size + 1)
        self.cond_rnn_dim_in = int(self.cond_rnn_dim_in)
    else:
        self.cond_rnn_dim_in = self.module_dim * feature_dim[1] * feature_dim[2]
    self.full_pooling = nn.MaxPool2d(kernel_size=self.cond_rnn_pool_size, padding=0)
    if self.output_batchnorm:
        self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)

    # Initialize helper variables
    self.stem_use_coords = (stem_stride == 1) and (self.use_coords_freq > 0)
    self.condition_pattern = condition_pattern
    if len(condition_pattern) == 0:
        self.condition_pattern = []
        for i in range(self.module_num_layers * self.num_modules):
            self.condition_pattern.append(self.condition_method != 'concat')
    else:
        self.condition_pattern = [i > 0 for i in self.condition_pattern]
    self.extra_channel_freq = self.use_coords_freq
    self.block = FiLMedResBlock
    self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
    self.fwd_count = 0
    self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0  # == 2
    if self.debug_every <= -1:
        self.print_verbose_every = 1
    module_H = feature_dim[1] // (stem_stride ** stem_num_layers)
    module_W = feature_dim[2] // (stem_stride ** stem_num_layers)
    self.coords = coord_map((module_H, module_W))  # size (2, module_H, module_W), expanded linspace
    self.default_weight = Parameter(torch.ones(1, 1, self.module_dim).type(torch.cuda.FloatTensor),
                                    requires_grad=False)  # would replace FiLM; not used
    self.default_bias = Parameter(torch.zeros(1, 1, self.module_dim).type(torch.cuda.FloatTensor),
                                  requires_grad=False)  # not used

    # Initialize stem
    stem_feature_dim = feature_dim[0] + self.stem_use_coords * self.num_extra_channels  # 1024 + 2
    # stem: 1-layer CNN converting 1026 channels into 128 channels
    # (stem_batchnorm == 1, kernel_size=3, stride=1, padding=None)
    self.stem = build_stem(stem_feature_dim, module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           padding=stem_padding)

    # Initialize FiLMed network body
    for fn_num in range(self.num_modules):
        with_cond = self.condition_pattern[
            self.module_num_layers * fn_num:self.module_num_layers * (fn_num + 1)]
        mod = self.block(module_dim,
                         with_residual=module_residual,
                         with_batchnorm=module_batchnorm,
                         with_cond=with_cond,
                         dropout=module_dropout,  # 0e-2
                         num_extra_channels=self.num_extra_channels,
                         extra_channel_freq=self.extra_channel_freq,
                         with_input_proj=module_input_proj,
                         num_cond_maps=self.num_cond_maps,
                         kernel_size=module_kernel_size,
                         batchnorm_affine=module_batchnorm_affine,
                         num_layers=self.module_num_layers,
                         condition_method=condition_method,
                         debug_every=self.debug_every)
        self.add_module('block_' + str(fn_num), mod)
    for fn_num in range(self.num_modules):
        mem = LangMemBlock(self.hidden_dim)
        self.add_module('lang_mem_' + str(fn_num), mem)

    # proj rnn hidden state to common latent space
    self.rnn_proj = nn.Linear(hidden_dim * self.num_dir, classifier_proj_dim)
    self.cnn_proj = build_cnn_proj(module_dim + self.num_extra_channels, module_H, module_W,
                                   classifier_proj_dim, classifier_downsample,
                                   with_batchnorm=classifier_batchnorm,
                                   dropout=classifier_dropout)

    # cond_proj_out_dim = self.hidden_dim if self.att_mode == 'simple' else 2 * self.hidden_dim
    cond_proj_out_dim = self.hidden_dim
    if self.cond_cnn_proj:
        if not self.modulewise_cond:
            self.cnn_proj_for_cond = build_cnn_proj(module_dim + self.num_extra_channels,
                                                    module_H, module_W, cond_proj_out_dim,
                                                    classifier_downsample,
                                                    with_batchnorm=classifier_batchnorm,
                                                    dropout=classifier_dropout)
        else:
            for fn_num in range(self.num_modules):
                mod = build_cnn_proj(module_dim + self.num_extra_channels,
                                     module_H, module_W, cond_proj_out_dim,
                                     classifier_downsample,
                                     with_batchnorm=classifier_batchnorm,
                                     dropout=classifier_dropout)
                self.add_module('cnn_proj_for_cond_' + str(fn_num), mod)
    else:
        for fn_num in range(self.num_modules):
            # gamma, beta for the bidirectional RNN state
            mod = nn.Linear(self.cond_rnn_dim_in, 2 * self.hidden_dim)
            self.condition_rnn[fn_num] = mod
            self.add_module('condition_rnn_' + str(fn_num), mod)

    # Initialize output classifier
    self.classifier = build_classifier(num_answers, classifier_fc_layers, classifier_proj_dim,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)

    init_modules(self.modules())
def __init__(self, null_token=0, start_token=1, end_token=2,
             encoder_embed=None, encoder_vocab_size=100, decoder_vocab_size=100,
             wordvec_dim=200, hidden_dim=512, rnn_num_layers=1, rnn_dropout=0,
             output_batchnorm=False, bidirectional=False,
             encoder_type='gru', decoder_type='linear',
             gamma_option='linear', gamma_baseline=1,
             num_modules=4, module_num_layers=1, module_dim=128,
             parameter_efficient=False, debug_every=float('inf'), use_bert=False):
    super(FiLMGen, self).__init__()
    self.encoder_type = encoder_type
    self.decoder_type = decoder_type
    self.output_batchnorm = output_batchnorm
    self.bidirectional = bidirectional
    self.num_dir = 2 if self.bidirectional else 1
    self.gamma_option = gamma_option
    self.gamma_baseline = gamma_baseline
    self.num_modules = num_modules
    self.module_num_layers = module_num_layers
    self.module_dim = module_dim
    self.debug_every = debug_every
    self.NULL = null_token
    self.START = start_token
    self.END = end_token

    if self.bidirectional:
        if decoder_type != 'linear':
            raise NotImplementedError
        hidden_dim = int(hidden_dim / self.num_dir)

    self.func_list = {
        'linear': None,
        'sigmoid': F.sigmoid,
        'tanh': F.tanh,
        'exp': torch.exp,
    }

    self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
    if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
        self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules

    self.use_bert = use_bert
    if use_bert:
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.bert_proj = nn.Linear(768, wordvec_dim)

    self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
    self.encoder_rnn = init_rnn(self.encoder_type, wordvec_dim, hidden_dim, rnn_num_layers,
                                dropout=rnn_dropout, bidirectional=self.bidirectional)
    self.decoder_rnn = init_rnn(self.decoder_type, hidden_dim, hidden_dim, rnn_num_layers,
                                dropout=rnn_dropout, bidirectional=self.bidirectional)
    self.decoder_linear = nn.Linear(hidden_dim * self.num_dir,
                                    self.num_modules * self.cond_feat_size)
    if self.output_batchnorm:
        self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)
    init_modules(self.modules())
def __init__(self, vocab, rnn_wordvec_dim=300, rnn_dim=256, rnn_num_layers=2, rnn_dropout=0,
             feature_dim=(1024, 14, 14), stem_module_dim=128,
             stem_use_resnet=False, stem_resnet_fixed=False, resnet_model_stage=3,
             stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3,
             stem_stride=1, stem_stride2_freq=0, stem_padding=None,
             use_coords=0, film=False,
             stacked_attn_dim=512, num_stacked_attn=2, sa_kernel_size=1,
             fc_use_batchnorm=False, fc_dropout=0, fc_dims=(1024, )):
    super(CnnLstmSaModel, self).__init__()
    rnn_kwargs = {
        'token_to_idx': vocab['question_token_to_idx'],
        'wordvec_dim': rnn_wordvec_dim,
        'rnn_dim': rnn_dim,
        'rnn_num_layers': rnn_num_layers,
        'rnn_dropout': rnn_dropout,
    }
    self.rnn = LstmEncoder(**rnn_kwargs)

    self.stem = build_stem(stem_use_resnet, stem_resnet_fixed, feature_dim[0], stem_module_dim,
                           resnet_model_stage=resnet_model_stage,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           stride2_freq=stem_stride2_freq,
                           padding=stem_padding)
    if stem_stride2_freq > 0:
        module_H = feature_dim[1] // (2 ** (stem_num_layers // stem_stride2_freq))
        module_W = feature_dim[2] // (2 ** (stem_num_layers // stem_stride2_freq))
    else:
        module_H = feature_dim[1]
        module_W = feature_dim[2]

    if use_coords == 1:
        self.coords = coord_map((module_H, module_W))
    else:
        use_coords = 0
        self.coords = None
    self.image_proj = nn.Conv2d(stem_module_dim, stacked_attn_dim - 2 * use_coords,
                                kernel_size=1, padding=0)
    self.ques_proj = nn.Linear(rnn_dim, stacked_attn_dim)

    self.stacked_attns = []
    for i in range(num_stacked_attn):
        sa = StackedAttention(stacked_attn_dim, stacked_attn_dim,
                              kernel_size=sa_kernel_size, film=film)
        # sa = StackedAttention(rnn_dim, stacked_attn_dim)
        self.stacked_attns.append(sa)
        self.add_module('stacked-attn-%d' % i, sa)

    self.classifier = build_classifier(module_C=stacked_attn_dim,
                                       module_H=None,
                                       module_W=None,
                                       num_answers=len(vocab['answer_token_to_idx']),
                                       fc_dims=fc_dims,
                                       proj_dim=None,
                                       downsample=None,
                                       with_batchnorm=fc_use_batchnorm,
                                       dropout=fc_dropout)
    init_modules(self.modules(), init='normal')
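# A minimal usage sketch (assumed, not the repo's forward) of how this variant can combine
# the projected image features, the optional coordinate channels, and the projected question
# vector before running the stacked attention hops. How `build_classifier` expects its input
# to be shaped is not shown in this section, so the final call is illustrative only.
def _cnn_lstm_sa_forward_sketch(model, image, question):
    feats = model.stem(image)                                    # (N, stem_module_dim, H, W)
    v = model.image_proj(feats)                                  # (N, sa_dim - 2*use_coords, H, W)
    if model.coords is not None:
        N = v.size(0)
        v = torch.cat([v, model.coords.unsqueeze(0).expand(N, -1, -1, -1)], dim=1)  # add coords
    u = model.ques_proj(model.rnn(question))                     # (N, stacked_attn_dim)
    for sa in model.stacked_attns:
        u = sa(v, u)                                             # refine the question vector per hop
    return model.classifier(u)                                   # exact expected shape is elsewhere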
def __init__(self, vocab, feature_dim=(1024, 14, 14),
             stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3,
             stem_stride=1, stem_padding=None,
             num_modules=4, max_program_module_arity=2, max_program_tree_depth=5,
             module_num_layers=1, module_dim=128, module_residual=True,
             module_batchnorm=False, module_batchnorm_affine=False, module_dropout=0,
             module_input_proj=1, module_kernel_size=3,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0,
             condition_method='bn-film', condition_pattern=[],
             use_gamma=True, use_beta=True, use_coords=1,
             debug_every=float('inf'), print_verbose_every=float('inf'), verbose=True):
    super(TFiLMedNet, self).__init__()

    num_answers = len(vocab['answer_idx_to_token'])

    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.num_modules = num_modules
    self.max_program_module_arity = max_program_module_arity
    self.max_program_tree_depth = max_program_tree_depth
    self.module_num_layers = module_num_layers
    self.module_batchnorm = module_batchnorm
    self.module_dim = module_dim
    self.condition_method = condition_method
    self.use_gamma = use_gamma
    self.use_beta = use_beta
    self.use_coords_freq = use_coords
    self.debug_every = debug_every
    self.print_verbose_every = print_verbose_every

    # Initialize helper variables
    self.stem_use_coords = (stem_stride == 1) and (self.use_coords_freq > 0)
    self.condition_pattern = condition_pattern
    if len(condition_pattern) == 0:
        self.condition_pattern = []
        for i in range(self.max_program_tree_depth):
            idepth = []
            for j in range(self.max_program_module_arity):
                ijarity = [[self.condition_method != 'concat'] * 2] * self.module_num_layers
                idepth.append(ijarity)
            self.condition_pattern.append(idepth)
    else:
        for i in range(self.max_program_tree_depth):
            for j in range(self.max_program_module_arity):
                self.condition_pattern[i][j] = [k > 0 for k in self.condition_pattern[i][j]]
    self.extra_channel_freq = self.use_coords_freq
    # self.block = FiLMedResBlock
    self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
    self.fwd_count = 0
    self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
    if self.debug_every <= -1:
        self.print_verbose_every = 1
    module_H = feature_dim[1] // (stem_stride ** stem_num_layers)  # Rough calc: works for main cases
    module_W = feature_dim[2] // (stem_stride ** stem_num_layers)  # Rough calc: works for main cases
    self.coords = coord_map((module_H, module_W))
    self.default_weight = Variable(torch.ones(1, 1, self.module_dim)).type(torch.cuda.FloatTensor)
    self.default_bias = Variable(torch.zeros(1, 1, self.module_dim)).type(torch.cuda.FloatTensor)

    # Initialize stem
    stem_feature_dim = feature_dim[0] + self.stem_use_coords * self.num_extra_channels
    self.stem = build_stem(stem_feature_dim, module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           padding=stem_padding)

    # Initialize FiLMed network body
    self.function_modules = {}
    self.vocab = vocab
    # for fn_num in range(self.num_modules):
    mod = ResidualBlock(module_dim,
                        with_residual=module_residual,
                        with_batchnorm=module_batchnorm)
    self.add_module('0', mod)
    self.function_modules['0'] = mod
    for dep in range(self.max_program_tree_depth):
        for art in range(self.max_program_module_arity):
            with_cond = self.condition_pattern[dep][art]
            if art == 0:
                mod = ResidualBlock(module_dim,
                                    with_residual=module_residual,
                                    with_batchnorm=module_batchnorm)
            else:
                mod = ConcatBlock(art + 1, module_dim,
                                  with_residual=module_residual,
                                  with_batchnorm=module_batchnorm)
            ikey = str(dep + 1) + '-' + str(art + 1)
            self.add_module(ikey, mod)
            self.function_modules[ikey] = mod

    # Initialize output classifier
    self.classifier = build_classifier(module_dim + self.num_extra_channels, module_H, module_W,
                                       num_answers, classifier_fc_layers, classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)

    init_modules(self.modules())