def __init__(self, vocab, rnn_wordvec_dim=300, rnn_dim=256, rnn_num_layers=2, rnn_dropout=0, fc_use_batchnorm=False, fc_dropout=0, fc_dims=(1024, )):
    """LSTM-only baseline: encode the question with an LSTM, then classify.

    The image is ignored entirely; the classifier maps the final RNN
    encoding straight to answer logits.

    Args:
        vocab: dict with 'question_token_to_idx' and 'answer_token_to_idx'.
        rnn_wordvec_dim / rnn_dim / rnn_num_layers / rnn_dropout: encoder config.
        fc_use_batchnorm / fc_dropout / fc_dims: classifier MLP config.
    """
    super(LstmModel, self).__init__()
    # Question encoder over the question vocabulary.
    self.rnn = LstmEncoder(token_to_idx=vocab['question_token_to_idx'],
                           wordvec_dim=rnn_wordvec_dim,
                           rnn_dim=rnn_dim,
                           rnn_num_layers=rnn_num_layers,
                           rnn_dropout=rnn_dropout)
    # MLP head from the rnn_dim-sized encoding to answer logits; no
    # spatial dims, projection, or downsampling (pure vector input).
    answer_count = len(vocab['answer_token_to_idx'])
    self.classifier = build_classifier(module_C=rnn_dim,
                                       module_H=None,
                                       module_W=None,
                                       num_answers=answer_count,
                                       fc_dims=fc_dims,
                                       proj_dim=None,
                                       downsample=None,
                                       with_batchnorm=fc_use_batchnorm,
                                       dropout=fc_dropout)
def __init__(self, vocab, feature_dim=(1024, 14, 14), stem_module_dim=128, stem_use_resnet=False, stem_resnet_fixed=False, resnet_model_stage=3, stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3, stem_stride=1, stem_stride2_freq=0, stem_padding=None, fc_dims=(1024, ), fc_use_batchnorm=False, fc_dropout=0):
    """CNN-only baseline: stem -> 1x1 conv -> whole-map max-pool -> classifier.

    The question is ignored; answers are predicted from image features
    alone. ``feature_dim`` is the (C, H, W) of the input feature map.
    """
    super(CnnModel, self).__init__()
    # Convolutional stem (optionally a fixed/fine-tuned ResNet stage).
    self.stem = build_stem(stem_use_resnet, stem_resnet_fixed,
                           feature_dim[0], stem_module_dim,
                           resnet_model_stage=resnet_model_stage,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           stride2_freq=stem_stride2_freq,
                           padding=stem_padding)
    # Spatial size after the stem; the formula assumes every
    # stem_stride2_freq-th layer halves H and W — TODO confirm against
    # build_stem's stride2_freq semantics.
    if stem_stride2_freq > 0:
        shrink = 2**(stem_num_layers // stem_stride2_freq)
        module_H, module_W = feature_dim[1] // shrink, feature_dim[2] // shrink
    else:
        module_H, module_W = feature_dim[1], feature_dim[2]
    # 1x1 mixing conv, then a max-pool over the entire feature map,
    # collapsing (C, H, W) to a single C-vector per example.
    self.conv = nn.Conv2d(stem_module_dim, stem_module_dim, kernel_size=1, padding=0)
    self.pool = nn.MaxPool2d(kernel_size=(module_H, module_W),
                             stride=(module_H, module_W))
    # Fully-connected answer head on the pooled vector (no spatial dims).
    self.classifier = build_classifier(module_C=stem_module_dim,
                                       module_H=None,
                                       module_W=None,
                                       num_answers=len(vocab['answer_token_to_idx']),
                                       fc_dims=fc_dims,
                                       proj_dim=None,
                                       downsample=None,
                                       with_batchnorm=fc_use_batchnorm,
                                       dropout=fc_dropout)
def __init__(self, vocab, feature_dim=[3, 64, 64], stem_dim=128, module_dim=128, stem_num_layers=2, stem_batchnorm=True, stem_kernel_size=3, stem_stride=1, stem_padding=None, stem_feature_dim=24, stem_subsample_layers=None, classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0, rnn_hidden_dim=128, **kwargs): super().__init__() # initialize stem self.stem = build_stem(feature_dim[0], stem_dim, module_dim, num_layers=stem_num_layers, with_batchnorm=stem_batchnorm, kernel_size=stem_kernel_size, stride=stem_stride, padding=stem_padding, subsample_layers=stem_subsample_layers) tmp = self.stem(Variable(torch.zeros([1] + feature_dim))) _, F, H, W = tmp.size() # initialize classifier # TODO(mnoukhov): fix this for >1 layer RNN question_dim = rnn_hidden_dim image_dim = F * H * W num_answers = len(vocab['answer_idx_to_token']) self.classifier = build_classifier(image_dim + question_dim, 1, 1, num_answers, classifier_fc_layers, None, None, classifier_batchnorm, classifier_dropout) init_modules(self.modules())
def __init__(self, vocab, feature_dim=(1024, 14, 14), stem_num_layers=2, stem_batchnorm=False, module_dim=128, module_residual=True, module_batchnorm=False, classifier_proj_dim=512, classifier_downsample='maxpool2', classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0, verbose=True):
    """Neural Module Network: a CNN stem, one module per program token, and
    a classifier head.

    One sub-module is registered per token in ``vocab['program_token_to_idx']``:
    a ResidualBlock for 'scene'/unary tokens, a ConcatBlock for binary ones.

    Args:
        vocab: dict with 'answer_idx_to_token' and 'program_token_to_idx'.
        feature_dim: (C, H, W) of the input feature map.
        module_*: per-module block configuration.
        classifier_*: classifier head configuration.
        verbose: print the stem and classifier after construction.

    Raises:
        ValueError: if a program token has an arity this network cannot
            instantiate a module for.
    """
    super(ModuleNet, self).__init__()
    self.stem = build_stem(feature_dim[0], module_dim, num_layers=stem_num_layers, with_batchnorm=stem_batchnorm)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    num_answers = len(vocab['answer_idx_to_token'])
    # The stem here preserves spatial size, so the classifier sees H, W
    # straight from feature_dim.
    module_H, module_W = feature_dim[1], feature_dim[2]
    self.classifier = build_classifier(module_dim, module_H, module_W, num_answers, classifier_fc_layers, classifier_proj_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    # Per-stage timing buffers; only filled when self.timing is True.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    for fn_str in vocab['program_token_to_idx']:
        num_inputs = vr.programs.get_num_inputs(fn_str)
        self.function_modules_num_inputs[fn_str] = num_inputs
        if fn_str == 'scene' or num_inputs == 1:
            mod = ResidualBlock(module_dim, with_residual=module_residual, with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            mod = ConcatBlock(module_dim, with_residual=module_residual, with_batchnorm=module_batchnorm)
        else:
            # Bug fix: previously a token with an unhandled arity either
            # raised NameError (first iteration) or silently re-registered
            # the module built for the PREVIOUS token. Fail loudly instead.
            raise ValueError('Unsupported arity %d for program token %r' % (num_inputs, fn_str))
        self.add_module(fn_str, mod)
        self.function_modules[fn_str] = mod
    self.save_module_outputs = False
def __init__(
    self,
    vocab,
    feature_dim=(1024, 14, 14),
    stem_num_layers=2,
    stem_batchnorm=False,
    stem_kernel_size=3,
    stem_subsample_layers=None,
    stem_stride=1,
    stem_padding=None,
    stem_dim=64,
    num_modules=4,
    module_num_layers=1,
    module_dim=128,
    module_residual=True,
    module_intermediate_batchnorm=False,
    module_batchnorm=False,
    module_batchnorm_affine=False,
    module_dropout=0,
    module_input_proj=1,
    module_kernel_size=3,
    classifier_proj_dim=512,
    classifier_downsample='maxpool2',
    classifier_fc_layers=(1024, ),
    classifier_batchnorm=False,
    classifier_dropout=0,
    condition_method='bn-film',
    condition_pattern=[],
    use_gamma=True,
    use_beta=True,
    use_coords=1,
    debug_every=float('inf'),
    print_verbose_every=float('inf'),
    verbose=True,
):
    """Build a FiLMed network: a conv stem, ``num_modules`` FiLMedResBlocks
    conditioned via FiLM (feature-wise linear modulation), and a classifier.

    ``feature_dim`` is the (C, H, W) of the input feature map.
    ``condition_pattern``, when non-empty, is a per-layer truthy mask of
    which module layers receive FiLM conditioning; when empty every layer
    is conditioned unless ``condition_method == 'concat'``.

    NOTE(review): ``condition_pattern=[]`` is a mutable default argument;
    it is never mutated (the attribute is rebound below), so it is safe,
    but a None default would be cleaner.
    """
    super(FiLMedNet, self).__init__()
    num_answers = len(vocab['answer_idx_to_token'])
    # Per-stage timing buffers; only used when self.timing is True.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    self.num_modules = num_modules
    self.module_num_layers = module_num_layers
    self.module_batchnorm = module_batchnorm
    self.module_dim = module_dim
    self.condition_method = condition_method
    self.use_gamma = use_gamma
    self.use_beta = use_beta
    self.use_coords_freq = use_coords
    self.debug_every = debug_every
    self.print_verbose_every = print_verbose_every
    # Initialize helper variables.
    # Coordinate channels are fed to the stem only when it does not stride.
    self.stem_use_coords = (stem_stride == 1) and (self.use_coords_freq > 0)
    self.condition_pattern = condition_pattern
    if len(condition_pattern) == 0:
        # Default: condition every layer of every module, except under
        # 'concat' conditioning, which carries the condition as extra maps.
        self.condition_pattern = []
        for i in range(self.module_num_layers * self.num_modules):
            self.condition_pattern.append(self.condition_method != 'concat')
    else:
        # Normalize user-provided pattern entries to booleans.
        self.condition_pattern = [i > 0 for i in self.condition_pattern]
    self.extra_channel_freq = self.use_coords_freq
    self.block = FiLMedResBlock
    # 'concat' conditioning appends gamma/beta as 2*module_dim extra maps.
    self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
    self.fwd_count = 0
    # Two extra channels (x, y coordinate maps) when coords are enabled.
    self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
    if self.debug_every <= -1:
        self.print_verbose_every = 1
    # Initialize stem; its input grows by the coordinate channels if used.
    stem_feature_dim = feature_dim[0] + self.stem_use_coords * self.num_extra_channels
    self.stem = build_stem(stem_feature_dim, stem_dim, module_dim, num_layers=stem_num_layers, with_batchnorm=stem_batchnorm, kernel_size=stem_kernel_size, stride=stem_stride, padding=stem_padding, subsample_layers=stem_subsample_layers)
    # Probe the stem with a dummy zero batch to get its output spatial size.
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    # Coordinate maps at stem-input and module resolutions.
    self.stem_coords = coord_map((feature_dim[1], feature_dim[2]))
    self.coords = coord_map((module_H, module_W))
    # Fallback FiLM parameters (identity scaling, zero shift).
    self.default_weight = torch.ones(1, 1, self.module_dim).to(device)
    self.default_bias = torch.zeros(1, 1, self.module_dim).to(device)
    # Initialize FiLMed network body: one block per module index.
    self.function_modules = {}
    self.vocab = vocab
    for fn_num in range(self.num_modules):
        # Slice out this module's per-layer conditioning flags.
        with_cond = self.condition_pattern[self.module_num_layers * fn_num:self.module_num_layers * (fn_num + 1)]
        mod = self.block(module_dim,
                         with_residual=module_residual,
                         with_intermediate_batchnorm=module_intermediate_batchnorm,
                         with_batchnorm=module_batchnorm,
                         with_cond=with_cond,
                         dropout=module_dropout,
                         num_extra_channels=self.num_extra_channels,
                         extra_channel_freq=self.extra_channel_freq,
                         with_input_proj=module_input_proj,
                         num_cond_maps=self.num_cond_maps,
                         kernel_size=module_kernel_size,
                         batchnorm_affine=module_batchnorm_affine,
                         num_layers=self.module_num_layers,
                         condition_method=condition_method,
                         debug_every=self.debug_every)
        self.add_module(str(fn_num), mod)
        self.function_modules[fn_num] = mod
    # Initialize output classifier; input includes the coordinate channels.
    self.classifier = build_classifier(module_dim + self.num_extra_channels, module_H, module_W, num_answers, classifier_fc_layers, classifier_proj_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    init_modules(self.modules())
def __init__(self, vocab, feature_dim, stem_num_layers, stem_batchnorm, stem_subsample_layers, stem_kernel_size, stem_stride, stem_padding, stem_dim, module_dim, module_kernel_size, module_input_proj, forward_func, use_color, module_residual=True, module_batchnorm=False, classifier_proj_dim=512, classifier_downsample='maxpool2', classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0, use_film=False, verbose=True):
    """Build a program-driven module network with a selectable forward
    strategy (looked up from ``FUNC_DICT[forward_func]``).

    Two module regimes:
      * ``use_film=True`` — a single shared unary FiLMed block and a single
        shared binary ConcatFiLMed block, both registered under 'film'.
      * otherwise — one block per program token: ConcatBlock for arity-2
        tokens when ``forward_func == 'tree'``, ResidualBlock for the rest.
    """
    super().__init__()
    self.module_dim = module_dim
    # Forward strategy is chosen by name; FUNC_DICT maps names to callables.
    self.func = FUNC_DICT[forward_func]
    self.use_color = use_color
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim, num_layers=stem_num_layers, subsample_layers=stem_subsample_layers, kernel_size=stem_kernel_size, padding=stem_padding, with_batchnorm=stem_batchnorm)
    # Probe the stem with a dummy zero batch to learn its output H, W.
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    # Coordinate map with a leading batch dim for broadcasting.
    self.coords = coord_map((module_H, module_W)).unsqueeze(0)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, module_H, module_W, num_answers, classifier_fc_layers, classifier_proj_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    self.unary_function_modules = {}
    self.binary_function_modules = {}
    self.vocab = vocab
    self.use_film = use_film
    if self.use_film:
        # One shared FiLMed block for all unary ops ...
        unary_mod = FiLMedResBlock(module_dim,
                                   with_residual=module_residual,
                                   with_intermediate_batchnorm=False,
                                   with_batchnorm=False,
                                   with_cond=[True, True],
                                   num_extra_channels=2,  # was 2 for original film
                                   extra_channel_freq=1,
                                   with_input_proj=module_input_proj,
                                   num_cond_maps=0,
                                   kernel_size=module_kernel_size,
                                   batchnorm_affine=False,
                                   num_layers=1,
                                   condition_method='bn-film',
                                   debug_every=float('inf'))
        # ... and one shared two-input FiLMed block for all binary ops.
        binary_mod = ConcatFiLMedResBlock(2, module_dim,
                                          with_residual=module_residual,
                                          with_intermediate_batchnorm=False,
                                          with_batchnorm=False,
                                          with_cond=[True, True],
                                          num_extra_channels=2,  # was 2 for original film
                                          extra_channel_freq=1,
                                          with_input_proj=module_input_proj,
                                          num_cond_maps=0,
                                          kernel_size=module_kernel_size,
                                          batchnorm_affine=False,
                                          num_layers=1,
                                          condition_method='bn-film',
                                          debug_every=float('inf'))
        self.unary_function_modules['film'] = unary_mod
        self.binary_function_modules['film'] = binary_mod
        self.add_module('film_unary', unary_mod)
        self.add_module('film_binary', binary_mod)
    else:
        # One dedicated block per program token, keyed by token string.
        for fn_str in vocab['program_token_to_idx']:
            arity = self.vocab['program_token_arity'][fn_str]
            if arity == 2 and forward_func == 'tree':
                binary_mod = ConcatBlock(module_dim, kernel_size=module_kernel_size, with_residual=module_residual, with_batchnorm=module_batchnorm, use_simple=False)
                self.add_module(fn_str, binary_mod)
                self.binary_function_modules[fn_str] = binary_mod
            else:
                mod = ResidualBlock(module_dim, kernel_size=module_kernel_size, with_residual=module_residual, with_batchnorm=module_batchnorm)
                self.add_module(fn_str, mod)
                self.unary_function_modules[fn_str] = mod
    self.declare_film_coefficients()
def __init__(self, vocab, feature_dim=(3, 64, 64), stem_num_layers=2, stem_batchnorm=True, stem_kernel_size=3, stem_stride=1, stem_padding=None, stem_dim=24, module_num_layers=1, module_dim=128, classifier_fc_layers=(1024,), classifier_batchnorm=False, classifier_dropout=0, rnn_hidden_dim=128,
             # unused
             stem_subsample_layers=[], module_input_proj=None, module_residual=None, module_kernel_size=None, module_batchnorm=None, classifier_proj_dim=None, classifier_downsample=None, debug_every=float('inf'), print_verbose_every=float('inf'), verbose=True):
    """Build a Relation-Network-style model: a conv stem produces a grid of
    "objects"; pairs of objects (each with 2 appended coordinates) plus a
    question vector feed the relation MLP, whose pooled output goes to the
    classifier (f_theta).

    Parameters after the ``# unused`` marker are accepted only for
    interface compatibility with sibling models and are ignored
    (except ``stem_subsample_layers``, which is still forwarded to the stem).

    NOTE(review): ``stem_subsample_layers=[]`` is a mutable default
    argument; it is only read here, so it is harmless as written.
    """
    super().__init__()
    # initialize stem
    self.stem = build_stem(feature_dim[0], stem_dim, stem_dim, num_layers=stem_num_layers, with_batchnorm=stem_batchnorm, kernel_size=stem_kernel_size, stride=stem_stride, padding=stem_padding, subsample_layers=stem_subsample_layers)
    # Probe the stem with a dummy zero batch to learn its output H, W.
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    # initialize coordinates to be appended to "objects"
    # can be switched to using torch.meshgrid after 0.4.1
    x = torch.linspace(-1, 1, steps=module_W)
    y = torch.linspace(-1, 1, steps=module_H)
    # Builds a (module_W, module_H) grid then flattens to (H*W, 2).
    # NOTE(review): the pairing assumes the flattened feature grid is
    # traversed in the same order — verify against the forward pass;
    # for square feature maps any transposition is shape-compatible.
    xv = x.unsqueeze(1).repeat(1, module_H)
    yv = y.unsqueeze(0).repeat(module_W, 1)
    coords = torch.stack([xv, yv], dim=2).view(-1, 2)
    self.coords = Variable(coords.to(device))
    # initialize relation model
    # (output of stem + 2 coordinates) * 2 objects + question vector
    relation_modules = [nn.Linear((stem_dim + 2) * 2 + rnn_hidden_dim, module_dim)]
    for _ in range(module_num_layers - 1):
        relation_modules.append(nn.Linear(module_dim, module_dim))
    self.relation = nn.Sequential(*relation_modules)
    # initialize classifier (f_theta)
    num_answers = len(vocab['answer_idx_to_token'])
    # module_H = module_W = 1: the classifier input is a flat vector.
    self.classifier = build_classifier(module_dim, 1, 1, num_answers, classifier_fc_layers, classifier_proj_dim, classifier_downsample, classifier_batchnorm, classifier_dropout)
    init_modules(self.modules())
def __init__(
    self,
    vocab,
    feature_dim=(1024, 14, 14),
    stem_num_layers=2,
    stem_batchnorm=False,
    stem_kernel_size=3,
    stem_stride=1,
    stem_padding=None,
    num_modules=4,
    module_num_layers=1,
    module_dim=128,
    module_residual=True,
    module_batchnorm=False,
    module_batchnorm_affine=False,
    module_dropout=0,
    module_input_proj=1,
    module_kernel_size=3,
    classifier_proj_dim=512,
    classifier_downsample='maxpool2',
    classifier_fc_layers=(1024, ),
    classifier_batchnorm=False,
    classifier_dropout=0,
    condition_method='bn-film',
    condition_pattern=[],
    use_gamma=True,
    use_beta=True,
    use_coords=1,
    # for Language part:
    null_token=0,
    start_token=1,
    end_token=2,
    encoder_embed=None,
    encoder_vocab_size=100,
    decoder_vocab_size=100,
    wordvec_dim=200,
    hidden_dim=512,
    rnn_num_layers=1,
    rnn_dropout=0,
    rnn_time_step=None,
    output_batchnorm=False,
    bidirectional=False,
    encoder_type='gru',
    decoder_type='linear',
    gamma_option='linear',
    gamma_baseline=1,
    parameter_efficient=False,
    debug_every=float('inf'),
    print_verbose_every=float('inf'),
    verbose=True,
):
    """Build a FiLMed network whose FiLM parameters are produced by an
    in-model language encoder (embedding + RNN) rather than an external
    program generator.

    The vision side mirrors the plain FiLMedNet: a conv stem, per-module
    FiLMedResBlocks, coordinate channels, and a classifier. The language
    side adds an encoder RNN, per-module linear condition blocks that map
    the RNN state to gamma/beta, LangMemBlocks, and rnn/cnn projection
    heads into a common latent space.

    NOTE(review): several flags are hard-coded below (use_rnn_stem=False,
    cond_rnn_pool=False, modulewise_cond=False, cond_cnn_proj=True) —
    they gate dead or experimental branches kept for reference.
    NOTE(review): ``condition_pattern=[]`` is a mutable default argument;
    it is never mutated here, so it is safe as written.
    """
    super(FiLMedNet, self).__init__()
    self.vocab = vocab
    num_answers = len(vocab['answer_idx_to_token'])
    # Per-stage timing buffers; only used when self.timing is True.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    # for image part
    self.num_modules = num_modules
    self.module_num_layers = module_num_layers
    self.module_batchnorm = module_batchnorm
    self.module_dim = module_dim  # 128
    self.condition_method = condition_method
    self.use_gamma = use_gamma
    self.use_beta = use_beta
    self.use_coords_freq = use_coords  # == 1
    self.feature_dim = feature_dim
    # for language part
    self.encoder_type = encoder_type
    self.decoder_type = decoder_type
    self.output_batchnorm = output_batchnorm
    self.bidirectional = bidirectional
    self.rnn_time_step = rnn_time_step
    self.hidden_dim = hidden_dim
    self.num_dir = 2 if self.bidirectional else 1
    self.gamma_option = gamma_option
    self.gamma_baseline = gamma_baseline  # =1
    self.debug_every = debug_every
    # Special token indices for the question sequence.
    self.NULL = null_token
    self.START = start_token
    self.END = end_token
    self.debug_every = debug_every  # NOTE(review): redundant re-assignment, kept as-is
    self.print_verbose_every = print_verbose_every
    # initialize rnn
    if self.bidirectional:  # yes
        if decoder_type != 'linear':
            raise (NotImplementedError)
        # Halve per-direction width so the concatenated bi-directional
        # state keeps the requested hidden_dim.
        hidden_dim = (int)(hidden_dim / self.num_dir)
    # Named activation choices for gamma generation; 'linear' means none.
    # NOTE(review): F.sigmoid / F.tanh are deprecated aliases in newer
    # torch (use torch.sigmoid / torch.tanh) — kept for compatibility.
    self.func_list = {
        'linear': None,
        'sigmoid': F.sigmoid,
        'tanh': F.tanh,
        'exp': torch.exp,
        'relu': F.relu
    }
    self.cond_feat_size = 2 * self.module_dim * self.module_num_layers  # FiLM params per ResBlock
    if not parameter_efficient:  # parameter_efficient=False only used to load older trained models
        self.cond_feat_size = 4 * self.module_dim + 2 * self.num_modules
    self.encoder_embed = nn.Embedding(encoder_vocab_size, wordvec_dim)
    self.encoder_rnn = init_rnn(self.encoder_type, wordvec_dim, hidden_dim, rnn_num_layers, dropout=rnn_dropout, bidirectional=self.bidirectional)
    # Initialize stem for rnn (disabled experimental branch).
    self.use_rnn_stem = False
    self.stem_rnn_size = int(256 / 2)
    if self.use_rnn_stem:
        self.stem_rnn = init_rnn(self.encoder_type, self.hidden_dim, self.stem_rnn_size, rnn_num_layers, dropout=rnn_dropout, bidirectional=self.bidirectional)
        self.hidden_dim = self.stem_rnn_size * 2
        hidden_dim = self.stem_rnn_size
    # One linear head per module mapping RNN state -> FiLM gamma/beta.
    self.condition_block = {}
    for fn_num in range(self.num_modules):
        mod = nn.Linear(hidden_dim * self.num_dir, self.cond_feat_size)  # gamma, beta for each block.
        self.condition_block[fn_num] = mod
        self.add_module('condition_block_' + str(fn_num), mod)
    # build sentence conditioning for each module:
    self.condition_rnn = {}
    self.cond_rnn_pool = False
    self.modulewise_cond = False
    self.cond_cnn_proj = True
    self.cond_rnn_pool_size = 3
    self.cond_rnn_flatten = Flatten()
    if self.cond_rnn_pool:
        # Flattened size of the feature map after max-pooling.
        # NOTE(review): uses feature_dim[1] twice — presumably assumes
        # square maps; verify if H != W.
        self.cond_rnn_dim_in = self.module_dim * np.floor((feature_dim[1] - self.cond_rnn_pool_size) / self.cond_rnn_pool_size + 1) * np.floor((feature_dim[1] - self.cond_rnn_pool_size) / self.cond_rnn_pool_size + 1)
        self.cond_rnn_dim_in = int(self.cond_rnn_dim_in)
    else:
        self.cond_rnn_dim_in = self.module_dim * feature_dim[1] * feature_dim[2]
    self.full_pooling = nn.MaxPool2d(kernel_size=self.cond_rnn_pool_size, padding=0)
    if self.output_batchnorm:
        self.output_bn = nn.BatchNorm1d(self.cond_feat_size, affine=True)
    # Initialize helper variables
    self.stem_use_coords = (stem_stride == 1) and (self.use_coords_freq > 0)
    self.condition_pattern = condition_pattern
    if len(condition_pattern) == 0:
        # Default: condition every layer unless using 'concat' conditioning.
        self.condition_pattern = []
        for i in range(self.module_num_layers * self.num_modules):
            self.condition_pattern.append(self.condition_method != 'concat')
    else:
        self.condition_pattern = [i > 0 for i in self.condition_pattern]
    self.extra_channel_freq = self.use_coords_freq
    self.block = FiLMedResBlock
    self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
    self.fwd_count = 0
    self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0  # == 2
    if self.debug_every <= -1:
        self.print_verbose_every = 1
    # Output spatial size after the stem, derived analytically (no probe).
    module_H = feature_dim[1] // (stem_stride**stem_num_layers)
    module_W = feature_dim[2] // (stem_stride**stem_num_layers)
    self.coords = coord_map((module_H, module_W))  # size(2,module_H, module_W) expanded linspace.
    # Identity FiLM parameters (not trained, not used in the default path).
    # NOTE(review): torch.cuda.FloatTensor requires CUDA at construction time.
    self.default_weight = Parameter(torch.ones(1, 1, self.module_dim).type(torch.cuda.FloatTensor), requires_grad=False)  # to instead film, not used.
    self.default_bias = Parameter(torch.zeros(1, 1, self.module_dim).type(torch.cuda.FloatTensor), requires_grad=False)  # not used.
    # Initialize stem
    stem_feature_dim = feature_dim[0] + self.stem_use_coords * self.num_extra_channels  # 1024 + 2
    self.stem = build_stem(stem_feature_dim, module_dim, num_layers=stem_num_layers, with_batchnorm=stem_batchnorm, kernel_size=stem_kernel_size, stride=stem_stride, padding=stem_padding)  # stem_batchnorm == 1, kernel_size=3, stride=1, padding=None
    # stem: 1-layer CNN converting 1026 channels into 128 channels.
    # Initialize FiLMed network body
    for fn_num in range(self.num_modules):
        with_cond = self.condition_pattern[self.module_num_layers * fn_num:self.module_num_layers * (fn_num + 1)]
        mod = self.block(module_dim,
                         with_residual=module_residual,
                         with_batchnorm=module_batchnorm,
                         with_cond=with_cond,
                         dropout=module_dropout,  # 0e-2
                         num_extra_channels=self.num_extra_channels,
                         extra_channel_freq=self.extra_channel_freq,
                         with_input_proj=module_input_proj,
                         num_cond_maps=self.num_cond_maps,
                         kernel_size=module_kernel_size,
                         batchnorm_affine=module_batchnorm_affine,
                         num_layers=self.module_num_layers,
                         condition_method=condition_method,
                         debug_every=self.debug_every)
        self.add_module('block_' + str(fn_num), mod)
    # One language memory block per module.
    for fn_num in range(self.num_modules):
        mem = LangMemBlock(self.hidden_dim)
        self.add_module('lang_mem_' + str(fn_num), mem)
    # proj rnn hidden state to common latent space.
    self.rnn_proj = nn.Linear(hidden_dim * self.num_dir, classifier_proj_dim)
    self.cnn_proj = build_cnn_proj(module_dim + self.num_extra_channels, module_H, module_W, classifier_proj_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    # cond_proj_out_dim = self.hidden_dim if self.att_mode == 'simple' else 2 * self.hidden_dim
    cond_proj_out_dim = self.hidden_dim
    if self.cond_cnn_proj:
        if self.modulewise_cond == False:
            # One shared CNN projection for conditioning.
            self.cnn_proj_for_cond = build_cnn_proj(module_dim + self.num_extra_channels, module_H, module_W, cond_proj_out_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
        else:
            # One CNN projection per module.
            for fn_num in range(self.num_modules):
                mod = build_cnn_proj(module_dim + self.num_extra_channels, module_H, module_W, cond_proj_out_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
                self.add_module('cnn_proj_for_cond_' + str(fn_num), mod)
    else:
        # Linear conditioning from the flattened feature map instead.
        for fn_num in range(self.num_modules):
            mod = nn.Linear(self.cond_rnn_dim_in, 2 * self.hidden_dim)  # gamma, beta for bi-direct
            self.condition_rnn[fn_num] = mod
            self.add_module('condition_rnn_' + str(fn_num), mod)
    # Initialize output classifier
    # NOTE(review): this build_classifier call passes only 5 positionals —
    # a different signature than the vision-only variant; confirm it binds
    # (num_answers, fc_dims, proj_dim) as intended.
    self.classifier = build_classifier(num_answers, classifier_fc_layers, classifier_proj_dim, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    init_modules(self.modules())
def __init__(self, vocab, feature_dim, module_dim, module_kernel_size, stem_dim, stem_num_layers, stem_subsample_layers, stem_kernel_size, stem_padding, stem_batchnorm, classifier_fc_layers, classifier_proj_dim, classifier_downsample, classifier_batchnorm, num_modules, hard_code_alpha=False, hard_code_tau=False, tau_init='random', alpha_init='xavier_uniform', model_type='soft', model_bernoulli=0.5, use_module='conv', use_stopwords=True, **kwargs):
    """Build a stochastic/soft module-layout network (SHNMN-style).

    ``alpha`` soft-selects which question token conditions each module;
    ``tau_0``/``tau_1`` soft-select each module's inputs (the layout).
    Either can be hard-coded (frozen) or learned as nn.Parameters.
    Module weights themselves are generated from per-token question
    embeddings whose slices parameterize conv / residual functions.
    """
    super().__init__()
    self.num_modules = num_modules
    # alphas and taus from Overleaf Doc.
    self.hard_code_alpha = hard_code_alpha
    self.hard_code_tau = hard_code_tau
    # NOTE(review): the number of conditioning question tokens is fixed
    # at 3 here — presumably tied to the dataset's question structure.
    num_question_tokens = 3
    if alpha_init.startswith('correct'):
        print('using correct initialization')
        alpha = INITS[alpha_init](torch.Tensor(num_modules, num_question_tokens))
    elif alpha_init == 'constant':
        # Constant init takes the fill value as a second argument.
        alpha = INITS[alpha_init](torch.Tensor(num_modules, num_question_tokens), 1)
    else:
        alpha = INITS[alpha_init](torch.Tensor(num_modules, num_question_tokens))
    print('initial alpha ')
    print(alpha)
    if hard_code_alpha:
        # Frozen alpha only makes sense with the known-correct layout.
        assert (alpha_init.startswith('correct'))
        self.alpha = Variable(alpha)
        self.alpha = self.alpha.to(device)
    else:
        self.alpha = nn.Parameter(alpha)
    # create taus
    if tau_init == 'tree':
        tau_0, tau_1 = _tree_tau()
        print("initializing with tree.")
    elif tau_init == 'chain':
        tau_0, tau_1 = _chain_tau()
        print("initializing with chain")
    elif tau_init == 'chain_with_shortcuts':
        tau_0, tau_1 = _chain_with_shortcuts_tau()
        print("initializing with chain and shortcuts")
    else:
        tau_0, tau_1 = _random_tau(num_modules)
    if hard_code_tau:
        # Frozen layout requires one of the structured initializations.
        assert (tau_init in ['chain', 'tree', 'chain_with_shortcuts'])
        self.tau_0 = Variable(tau_0)
        self.tau_1 = Variable(tau_1)
        self.tau_0 = self.tau_0.to(device)
        self.tau_1 = self.tau_1.to(device)
    else:
        self.tau_0 = nn.Parameter(tau_0)
        self.tau_1 = nn.Parameter(tau_1)
    if use_module == 'conv':
        # Per-token embeddings encode generated module weights:
        # dim_1 = bias + conv kernel, dim_2 = bias + two projections.
        embedding_dim_1 = module_dim + (module_dim * module_dim * module_kernel_size * module_kernel_size)
        embedding_dim_2 = module_dim + (2 * module_dim * module_dim)
        question_embeddings_1 = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_1)
        question_embeddings_2 = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_2)
        # Kaiming-style uniform bounds matching each slice's fan-in.
        stdv_1 = 1. / math.sqrt(module_dim * module_kernel_size * module_kernel_size)
        stdv_2 = 1. / math.sqrt(2 * module_dim)
        question_embeddings_1.weight.data.uniform_(-stdv_1, stdv_1)
        question_embeddings_2.weight.data.uniform_(-stdv_2, stdv_2)
        # Single embedding table whose rows are the two slices concatenated.
        self.question_embeddings = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_1 + embedding_dim_2)
        self.question_embeddings.weight.data = torch.cat([question_embeddings_1.weight.data, question_embeddings_2.weight.data], dim=-1)
        self.func = ConvFunc(module_dim, module_kernel_size)
    elif use_module == 'residual':
        # Residual modules need two conv-kernel slices (a and b).
        embedding_dim_1 = module_dim + (module_dim * module_dim * module_kernel_size * module_kernel_size)
        embedding_dim_2 = module_dim + (2 * module_dim * module_dim)
        question_embeddings_a = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_1)
        question_embeddings_b = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_1)
        question_embeddings_2 = nn.Embedding(len(vocab['question_idx_to_token']), embedding_dim_2)
        stdv_1 = 1. / math.sqrt(module_dim * module_kernel_size * module_kernel_size)
        stdv_2 = 1. / math.sqrt(2 * module_dim)
        question_embeddings_a.weight.data.uniform_(-stdv_1, stdv_1)
        question_embeddings_b.weight.data.uniform_(-stdv_1, stdv_1)
        question_embeddings_2.weight.data.uniform_(-stdv_2, stdv_2)
        self.question_embeddings = nn.Embedding(len(vocab['question_idx_to_token']), 2 * embedding_dim_1 + embedding_dim_2)
        self.question_embeddings.weight.data = torch.cat([question_embeddings_a.weight.data, question_embeddings_b.weight.data, question_embeddings_2.weight.data], dim=-1)
        self.func = ResidualFunc(module_dim, module_kernel_size)
    else:
        # 'find'-style module: embeddings are plain module_dim vectors.
        self.question_embeddings = nn.Embedding(len(vocab['question_idx_to_token']), module_dim)
        self.func = FindModule(module_dim, module_kernel_size)
    # stem for processing the image into a 3D tensor
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim, num_layers=stem_num_layers, subsample_layers=stem_subsample_layers, kernel_size=stem_kernel_size, padding=stem_padding, with_batchnorm=stem_batchnorm)
    # Probe the stem with a dummy zero batch to learn its output H, W.
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, module_H, module_W, num_answers, classifier_fc_layers, classifier_proj_dim, classifier_downsample, with_batchnorm=classifier_batchnorm)
    self.model_type = model_type
    self.use_module = use_module
    # Prior log-odds that the layout is a tree (vs chain), learned.
    p = model_bernoulli
    tree_odds = -numpy.log((1 - p) / p)
    self.tree_odds = nn.Parameter(torch.Tensor([tree_odds]))
def __init__(self, vocab, feature_dim=(1024, 14, 14), stem_use_resnet=False, stem_resnet_fixed=False, resnet_model_stage=3, stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3, stem_stride=1, stem_stride2_freq=0, stem_padding=None, module_dim=128, module_residual=True, module_batchnorm=False, classifier_proj_dim=512, classifier_downsample='maxpool2', classifier_fc_layers=(1024,), classifier_batchnorm=False, classifier_dropout=0, verbose=True):
    """Neural Module Network variant with an (optionally ResNet-based)
    stem and per-token arity read from ``vocab['program_token_num_inputs']``
    rather than computed from the token string.

    Registers one ResidualBlock per zero/one-input token and one
    ConcatBlock per token taking two or more inputs; remembers the first
    zero-input token (preferring the literal 'scene') in ``self.scene``.
    """
    super(ModuleNet, self).__init__()
    self.stem = build_stem(stem_use_resnet, stem_resnet_fixed, feature_dim[0], module_dim, resnet_model_stage=resnet_model_stage, num_layers=stem_num_layers, with_batchnorm=stem_batchnorm, kernel_size=stem_kernel_size, stride=stem_stride, stride2_freq=stem_stride2_freq, padding=stem_padding)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    # Spatial size after the stem: halved once per stride-2 layer.
    if stem_stride2_freq > 0:
        module_H = feature_dim[1] // (2 ** (stem_num_layers // stem_stride2_freq))
        module_W = feature_dim[2] // (2 ** (stem_num_layers // stem_stride2_freq))
    else:
        module_H = feature_dim[1]
        module_W = feature_dim[2]
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, module_H, module_W, num_answers, classifier_fc_layers, classifier_proj_dim, classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    # Per-stage timing buffers; only used when self.timing is True.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    self.function_modules = {}
    # Copy so later mutation cannot affect the vocab dict.
    self.function_modules_num_inputs = dict(vocab['program_token_num_inputs'])
    self.vocab = vocab
    # Token to use as the scene input producer (first 0-ary token,
    # overridden by a literal 'scene' token if present).
    self.scene = None
    for fn_str in vocab['program_token_to_idx']:
        # num_inputs = vr.programs.get_num_inputs(fn_str)
        # self.function_modules_num_inputs[fn_str] = num_inputs
        num_inputs = self.function_modules_num_inputs[fn_str]
        if num_inputs == 0 and self.scene is None:
            self.scene = fn_str
        elif fn_str == 'scene':
            self.scene = fn_str
        # NOTE(review): if a token's arity were negative, `mod` would be
        # left over from the previous iteration — the two branches below
        # assume every arity is >= 0.
        if num_inputs == 0 or num_inputs == 1:
            # if fn_str == 'scene' or num_inputs == 1:
            mod = ResidualBlock(module_dim, with_residual=module_residual, with_batchnorm=module_batchnorm)
        elif num_inputs >= 2:
            # ConcatBlock takes the input count so it can fuse n streams.
            mod = ConcatBlock(num_inputs, module_dim, with_residual=module_residual, with_batchnorm=module_batchnorm)
        self.add_module(fn_str, mod)
        self.function_modules[fn_str] = mod
    self.save_module_outputs = False
def __init__(self, vocab, feature_dim, use_film, use_simple_block, stem_num_layers, stem_batchnorm, stem_subsample_layers, stem_kernel_size, stem_stride, stem_padding, stem_dim, module_dim, module_pool, module_use_gammas, module_kernel_size, module_input_proj, module_residual=True, module_batchnorm=False, module_num_layers=1, mod_id_loss=False, kl_loss=False, learn_control=False, rnn_dim=None, classifier_proj_dim=512, classifier_downsample='maxpool2', classifier_fc_layers=(1024, ), classifier_batchnorm=False, classifier_dropout=0, discriminator_proj_dim=None, discriminator_downsample=None, discriminator_fc_layers=None, discriminator_dropout=None, verbose=True, type_anonymizer=False):
    """Module network with optional FiLM-shared cores, a module-identity
    discriminator head, and an optional learned controller.

    Module regimes (via ``create_module`` below):
      * ``use_film == 1`` — all tokens share one SharedFiLMedModule core.
      * ``use_film == 2`` — a separate FiLM core per token.
      * ``use_simple_block`` — SimpleConcatBlock for every token.
      * otherwise — ResidualBlock (arity 0/1) or ConcatBlock (arity 2).

    Discriminator ``discriminator_*`` settings default to the classifier's.
    """
    super(ModuleNet, self).__init__()
    # Fill in discriminator defaults from the classifier configuration.
    if discriminator_proj_dim is None:
        discriminator_proj_dim = classifier_proj_dim
    if discriminator_downsample is None:
        discriminator_downsample = classifier_downsample
    if discriminator_fc_layers is None:
        discriminator_fc_layers = classifier_fc_layers
    if discriminator_dropout is None:
        discriminator_dropout = classifier_dropout
    self.module_dim = module_dim
    self.use_film = use_film
    self.use_simple_block = use_simple_block
    self.mod_id_loss = mod_id_loss
    self.kl_loss = kl_loss
    self.learn_control = learn_control
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim, num_layers=stem_num_layers, subsample_layers=stem_subsample_layers, kernel_size=stem_kernel_size, padding=stem_padding, with_batchnorm=stem_batchnorm)
    # Probe the stem with a dummy zero batch to learn its output H, W.
    tmp = self.stem(Variable(torch.zeros([1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    self.coords = coord_map((module_H, module_W))
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    classifier_kwargs = dict(module_C=module_dim, module_H=module_H, module_W=module_W, num_answers=len(vocab['answer_idx_to_token']), fc_dims=classifier_fc_layers, proj_dim=classifier_proj_dim, downsample=classifier_downsample, with_batchnorm=classifier_batchnorm, dropout=classifier_dropout)
    # The discriminator predicts which program token produced a feature
    # map, hence num_answers = size of the program vocabulary.
    discriminator_kwargs = dict(module_C=module_dim, module_H=module_H, module_W=module_W, num_answers=len(vocab['program_idx_to_token']), fc_dims=discriminator_fc_layers, proj_dim=discriminator_proj_dim, downsample=discriminator_downsample, with_batchnorm=False, dropout=discriminator_dropout)
    if self.use_film:
        # FiLM cores pool to a vector, so the heads see 1x1 spatial input.
        classifier_kwargs['module_H'] = 1
        classifier_kwargs['module_W'] = 1
        discriminator_kwargs['module_H'] = 1
        discriminator_kwargs['module_W'] = 1
    self.classifier = build_classifier(**classifier_kwargs)
    if self.mod_id_loss:
        self.module_identifier = build_classifier(**discriminator_kwargs)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    shared_block = None
    if type_anonymizer:
        # All tokens share one ordinary residual core.
        shared_block = ResidualBlock(module_dim, kernel_size=module_kernel_size, with_residual=module_residual, with_batchnorm=module_batchnorm)
    elif use_film == 1:
        assert module_W == module_H
        shared_block = SharedFiLMedModule(module_dim, kernel_size=module_kernel_size, num_layers=module_num_layers, with_residual=module_residual, pool=module_pool, use_gammas=module_use_gammas, post_linear=kl_loss, learn_embeddings=not learn_control)
    if shared_block:
        self.shared_block = shared_block
        self.add_module('shared', shared_block)
    for fn_str, fn_idx in vocab['program_token_to_idx'].items():
        num_inputs = vocab['program_token_arity'][fn_str]
        self.function_modules_num_inputs[fn_str] = num_inputs

        # Factory closure over this iteration's fn_str/fn_idx/num_inputs;
        # called immediately below, so late binding is not an issue.
        def create_module():
            if num_inputs > 2:
                raise Exception('Not implemented!')
            if use_film == 1:
                return FiLMModule(shared_block, fn_idx)
            if use_film == 2:
                # A fresh FiLM core for each token.
                separate_core_block = SharedFiLMedModule(module_dim, module_W, kernel_size=module_kernel_size, with_residual=module_residual)
                return FiLMModule(separate_core_block, fn_idx)
            if use_simple_block:
                # brutally simple concatentation block
                # with 2 layers, no residual connection
                return SimpleConcatBlock(module_dim, kernel_size=module_kernel_size)
            if num_inputs in [0, 1]:
                return ResidualBlock(module_dim, kernel_size=module_kernel_size, with_residual=module_residual, with_batchnorm=module_batchnorm, shared_block=shared_block, post_linear=kl_loss)
            else:
                return ConcatBlock(module_dim, kernel_size=module_kernel_size, with_residual=module_residual, with_batchnorm=module_batchnorm, shared_block=shared_block, post_linear=kl_loss)

        mod = create_module()
        if mod is not None:
            self.add_module(fn_str, mod)
            self.function_modules[fn_str] = mod
    self.save_module_outputs = False
    self.noise_enabled = True
    if learn_control:
        # NOTE(review): 30 is presumably the max program/control steps —
        # confirm against MACControl's signature.
        self.controller = MACControl(30, rnn_dim, module_dim)
def __init__(self, vocab, rnn_wordvec_dim=300, rnn_dim=256, rnn_num_layers=2,
             rnn_dropout=0, feature_dim=(1024, 14, 14), stem_module_dim=128,
             stem_use_resnet=False, stem_resnet_fixed=False, resnet_model_stage=3,
             stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3,
             stem_stride=1, stem_stride2_freq=0, stem_padding=None, use_coords=0,
             film=False, stacked_attn_dim=512, num_stacked_attn=2, sa_kernel_size=1,
             fc_use_batchnorm=False, fc_dropout=0, fc_dims=(1024, )):
    """CNN + LSTM model with stacked attention over the image grid.

    The question is encoded by an LSTM, the image by a convolutional stem;
    both are projected into `stacked_attn_dim` and combined by
    `num_stacked_attn` StackedAttention layers before the final classifier.
    """
    super(CnnLstmSaModel, self).__init__()

    # Question encoder.
    self.rnn = LstmEncoder(token_to_idx=vocab['question_token_to_idx'],
                           wordvec_dim=rnn_wordvec_dim,
                           rnn_dim=rnn_dim,
                           rnn_num_layers=rnn_num_layers,
                           rnn_dropout=rnn_dropout)

    # Visual stem (either a ResNet stage or a plain conv stack).
    self.stem = build_stem(stem_use_resnet,
                           stem_resnet_fixed,
                           feature_dim[0],
                           stem_module_dim,
                           resnet_model_stage=resnet_model_stage,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           stride2_freq=stem_stride2_freq,
                           padding=stem_padding)

    # Output grid size: every `stem_stride2_freq` layers the stem halves
    # the spatial resolution.
    shrink = 2**(stem_num_layers // stem_stride2_freq) if stem_stride2_freq > 0 else 1
    grid_h = feature_dim[1] // shrink
    grid_w = feature_dim[2] // shrink

    # Optional 2-channel coordinate map appended to the image features.
    use_coords = 1 if use_coords == 1 else 0
    self.coords = coord_map((grid_h, grid_w)) if use_coords else None

    # Project image (minus the coord channels) and question into the
    # common attention dimension.
    self.image_proj = nn.Conv2d(stem_module_dim,
                                stacked_attn_dim - 2 * use_coords,
                                kernel_size=1,
                                padding=0)
    self.ques_proj = nn.Linear(rnn_dim, stacked_attn_dim)

    self.stacked_attns = []
    for idx in range(num_stacked_attn):
        attn = StackedAttention(stacked_attn_dim,
                                stacked_attn_dim,
                                kernel_size=sa_kernel_size,
                                film=film)
        self.stacked_attns.append(attn)
        self.add_module('stacked-attn-%d' % idx, attn)

    self.classifier = build_classifier(module_C=stacked_attn_dim,
                                       module_H=None,
                                       module_W=None,
                                       num_answers=len(vocab['answer_token_to_idx']),
                                       fc_dims=fc_dims,
                                       proj_dim=None,
                                       downsample=None,
                                       with_batchnorm=fc_use_batchnorm,
                                       dropout=fc_dropout)
    init_modules(self.modules(), init='normal')
def __init__(self, vocab, rnn_wordvec_dim=300, rnn_dim=256, rnn_num_layers=2,
             rnn_dropout=0, feature_dim=(1024, 14, 14), stem_module_dim=128,
             stem_use_resnet=False, stem_resnet_fixed=False, resnet_model_stage=3,
             stem_num_layers=2, stem_batchnorm=False, stem_kernel_size=3,
             stem_stride=1, stem_stride2_freq=0, stem_padding=None,
             use_coords=None, film=False, cl_kernel_size=1, cl_early_fusion=False,
             relational_module=False, rel_image_dim=24, rel_module_dim=256,
             rel_num_layers=4, multimodal_core=False, mc_module_dim=256,
             mc_num_layers=4, mc_batchnorm=True, mc_kernel_size=1,
             fc_dims=(1024, ), fc_use_batchnorm=False, fc_dropout=0):
    """CNN + LSTM model with optional FiLM conditioning and an optional
    relational module (https://arxiv.org/abs/1706.01427) or multimodal
    core (https://arxiv.org/abs/1809.04482) fusion stage.

    Args:
        vocab: dict with 'question_token_to_idx' and 'answer_token_to_idx'.
        use_coords: 1 to append a 2-channel coordinate map to the image
            features, 0 to disable; None selects 1 automatically when
            `relational_module` is set, else 0.
        film: if True, condition the visual pathway on the question via
            FiLM instead of concatenating the question embedding.
        relational_module / multimodal_core: mutually exclusive fusion
            stages; when neither is set a simple conv + pool pathway with
            early or late fusion (`cl_early_fusion`) is used.
    """
    super(CnnLstmModel, self).__init__()
    assert not relational_module or not multimodal_core

    # Normalize use_coords to 0/1 *before* any arithmetic uses it.
    # BUG FIX: this normalization previously happened after the FiLM
    # rnn_dim computation below, so the default use_coords=None raised
    # TypeError (`8 * None`) whenever film was combined with the
    # relational module or multimodal core.
    if use_coords == 1 or (use_coords is None and relational_module):
        use_coords = 1
    else:
        use_coords = 0

    if film:
        # With FiLM the question encoding must emit one (gamma, beta) pair
        # per conditioned channel, so rnn_dim is derived from the visual
        # feature width rather than taken from the caller.
        if relational_module:
            rnn_dim = 2 * stem_module_dim + 8 * use_coords
        elif multimodal_core:
            rnn_dim = 2 * stem_module_dim + 4 * use_coords
        else:
            rnn_dim = 2 * stem_module_dim

    rnn_kwargs = {
        'token_to_idx': vocab['question_token_to_idx'],
        'wordvec_dim': rnn_wordvec_dim,
        'rnn_dim': rnn_dim,
        'rnn_num_layers': rnn_num_layers,
        'rnn_dropout': rnn_dropout,
    }
    self.rnn = LstmEncoder(**rnn_kwargs)

    self.stem = build_stem(stem_use_resnet,
                           stem_resnet_fixed,
                           feature_dim[0],
                           stem_module_dim,
                           resnet_model_stage=resnet_model_stage,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           stride2_freq=stem_stride2_freq,
                           padding=stem_padding)

    # Spatial size of the stem output: every `stem_stride2_freq` layers
    # the stem halves the resolution.
    if stem_stride2_freq > 0:
        module_H = feature_dim[1] // (2**(stem_num_layers // stem_stride2_freq))
        module_W = feature_dim[2] // (2**(stem_num_layers // stem_stride2_freq))
    else:
        module_H = feature_dim[1]
        module_W = feature_dim[2]

    if use_coords:
        self.coords = coord_map((module_H, module_W))
    else:
        self.coords = None

    if film:
        self.film = FiLM()
    else:
        self.film = None

    self.relational_module = relational_module
    self.multimodal_core = multimodal_core
    if self.relational_module:
        # https://arxiv.org/abs/1706.01427
        self.conv = nn.Conv2d(stem_module_dim, rel_image_dim, kernel_size=1, padding=0)
        if film:
            self.rel = build_relational_module(
                feature_dim=((rel_image_dim + 2 * use_coords) * 2),
                module_dim=rel_module_dim,
                num_layers=rel_num_layers)
        else:
            # Without FiLM the question vector is concatenated to every
            # object pair, so it adds to the pairwise input width.
            self.rel = build_relational_module(
                feature_dim=((rel_image_dim + 2 * use_coords) * 2 + rnn_dim),
                module_dim=rel_module_dim,
                num_layers=rel_num_layers)
        module_C = rel_module_dim
    elif self.multimodal_core:
        # https://arxiv.org/abs/1809.04482
        if film:
            self.mc = build_multimodal_core(
                feature_dim=(stem_module_dim + 2 * use_coords),
                module_dim=mc_module_dim,
                num_layers=mc_num_layers,
                with_batchnorm=mc_batchnorm,
                kernel_size=mc_kernel_size)
        else:
            self.mc = build_multimodal_core(
                feature_dim=(stem_module_dim + rnn_dim + 2 * use_coords),
                module_dim=mc_module_dim,
                num_layers=mc_num_layers,
                with_batchnorm=mc_batchnorm,
                kernel_size=mc_kernel_size)
        module_C = mc_module_dim
    else:
        self.early_fusion = cl_early_fusion
        if cl_early_fusion and not film:
            # Early fusion: the tiled question vector is fused with the
            # image features by the convolution itself.
            self.conv = nn.Conv2d(stem_module_dim + 2 * use_coords + rnn_dim,
                                  stem_module_dim,
                                  kernel_size=cl_kernel_size,
                                  padding=cl_kernel_size // 2)
            module_C = stem_module_dim
        else:
            self.conv = nn.Conv2d(stem_module_dim + 2 * use_coords,
                                  stem_module_dim,
                                  kernel_size=cl_kernel_size,
                                  padding=cl_kernel_size // 2)
            if cl_early_fusion or film:
                module_C = stem_module_dim
            else:
                # Late fusion: question vector joins after pooling.
                module_C = stem_module_dim + rnn_dim

    self.pool = nn.MaxPool2d(kernel_size=(module_H, module_W),
                             stride=(module_H, module_W))
    self.classifier = build_classifier(module_C=module_C,
                                       module_H=None,
                                       module_W=None,
                                       num_answers=len(vocab['answer_token_to_idx']),
                                       fc_dims=fc_dims,
                                       proj_dim=None,
                                       downsample=None,
                                       with_batchnorm=fc_use_batchnorm,
                                       dropout=fc_dropout)
def __init__(self,
             vocab,
             feature_dim,
             use_film,
             use_simple_block,
             sharing_patterns,
             stem_num_layers,
             stem_batchnorm,
             stem_subsample_layers,
             stem_kernel_size,
             stem_stride,
             stem_padding,
             stem_dim,
             module_dim,
             module_kernel_size,
             module_input_proj,
             module_residual=True,
             module_batchnorm=False,
             classifier_proj_dim=512,
             classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ),
             classifier_batchnorm=False,
             classifier_dropout=0,
             verbose=True):
    """Build a neural module network: a conv stem, one module per program
    token in ``vocab['program_token_to_idx']`` (unary blocks for tokens of
    arity 0/1, concatenating blocks for arity 2), and an answer classifier.

    `use_film` switches the modules to FiLM-conditioned blocks;
    `sharing_patterns` then controls whether CNN weights and/or FiLM
    coefficients are shared across modules (see inline comments below).
    Raises for any program token of arity > 2.
    """
    super(ModuleNet, self).__init__()
    self.module_dim = module_dim
    # should be 0 or 1 to indicate the use of film block or not (0 would bring you back to the original EE model)
    self.use_film = use_film
    # should be 0 or 1 to indicate if we are using ResNets or a simple 3x3 conv followed by ReLU
    self.use_simple_block = use_simple_block
    # this should be a list of two elements (either 0 or 1). It's only active if self.use_film == 1
    # The first element of 1 indicates the sharing of CNN weights in the film blocks, 0 otheriwse
    # The second element of 1 indicate the sharing of film coefficient in the film blocks, 0 otherwise
    # so [1,0] would be sharing the CNN weights while having different film coefficients for different modules in the program
    self.sharing_patterns = sharing_patterns

    self.stem = build_stem(feature_dim[0],
                           stem_dim,
                           module_dim,
                           num_layers=stem_num_layers,
                           subsample_layers=stem_subsample_layers,
                           kernel_size=stem_kernel_size,
                           padding=stem_padding,
                           with_batchnorm=stem_batchnorm)
    # Run a dummy forward pass through the stem to discover the spatial
    # size of its output feature map.
    tmp = self.stem(
        Variable(
            torch.zeros(
                [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    self.coords = coord_map((module_H, module_W))
    if verbose:
        print('Here is my stem:')
        print(self.stem)

    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim,
                                       module_H,
                                       module_W,
                                       num_answers,
                                       classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)

    # Bookkeeping for optional per-stage timing (used by the forward pass).
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.function_modules = {}             # program token -> module (or shared block)
    self.function_modules_num_inputs = {}  # program token -> arity
    self.fn_str_2_filmId = {}              # program token -> index into FiLM coefficients
    self.vocab = vocab
    for fn_str in vocab['program_token_to_idx']:
        num_inputs = vocab['program_token_arity'][fn_str]
        self.function_modules_num_inputs[fn_str] = num_inputs
        if self.use_film:
            if self.sharing_patterns[1] == 1:
                # Shared FiLM coefficients: every token maps to slot 0.
                self.fn_str_2_filmId[fn_str] = 0
            else:
                # Per-token FiLM coefficients: assign the next free slot.
                self.fn_str_2_filmId[fn_str] = len(self.fn_str_2_filmId)
        if fn_str == 'scene' or num_inputs == 1:
            # Unary module (or the 0-arity 'scene' token).
            if self.use_film:
                if self.sharing_patterns[0] == 1:
                    # CNN weights shared: the single 'shared_film' block
                    # built after this loop is used instead.
                    mod = None
                else:
                    mod = FiLMedResBlock(
                        module_dim,
                        with_residual=module_residual,
                        with_intermediate_batchnorm=False,
                        with_batchnorm=False,
                        with_cond=[True, True],
                        num_extra_channels=2,  # was 2 for original film,
                        extra_channel_freq=1,
                        with_input_proj=module_input_proj,
                        num_cond_maps=0,
                        kernel_size=module_kernel_size,
                        batchnorm_affine=False,
                        num_layers=1,
                        condition_method='bn-film',
                        debug_every=float('inf'))
            else:
                if self.use_simple_block:
                    mod = SimpleVisualBlock(module_dim,
                                            kernel_size=module_kernel_size)
                else:
                    mod = ResidualBlock(module_dim,
                                        kernel_size=module_kernel_size,
                                        with_residual=module_residual,
                                        with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            # Binary module: concatenates its two inputs.
            if self.use_film:
                if self.sharing_patterns[0] == 1:
                    mod = None
                else:
                    mod = ConcatFiLMedResBlock(
                        2,
                        module_dim,
                        with_residual=module_residual,
                        with_intermediate_batchnorm=False,
                        with_batchnorm=False,
                        with_cond=[True, True],
                        num_extra_channels=2,  #was 2 for original film,
                        extra_channel_freq=1,
                        with_input_proj=module_input_proj,
                        num_cond_maps=0,
                        kernel_size=module_kernel_size,
                        batchnorm_affine=False,
                        num_layers=1,
                        condition_method='bn-film',
                        debug_every=float('inf'))
            else:
                mod = ConcatBlock(module_dim,
                                  kernel_size=module_kernel_size,
                                  with_residual=module_residual,
                                  with_batchnorm=module_batchnorm)
        else:
            raise Exception('Not implemented!')
        if mod is not None:
            self.add_module(fn_str, mod)
            self.function_modules[fn_str] = mod

    if self.use_film and self.sharing_patterns[0] == 1:
        # One shared FiLMed block reused by every program token; it takes
        # up to 2 inputs so it covers both the unary and binary cases.
        mod = ConcatFiLMedResBlock(
            2,
            module_dim,
            with_residual=module_residual,
            with_intermediate_batchnorm=False,
            with_batchnorm=False,
            with_cond=[True, True],
            num_extra_channels=2,  #was 2 for original film,
            extra_channel_freq=1,
            with_input_proj=module_input_proj,
            num_cond_maps=0,
            kernel_size=module_kernel_size,
            batchnorm_affine=False,
            num_layers=1,
            condition_method='bn-film',
            debug_every=float('inf'))
        self.add_module('shared_film', mod)
        self.function_modules['shared_film'] = mod

    # NOTE(review): source formatting was collapsed; this call is placed at
    # method level (not inside the `if` above) so that FiLM coefficients are
    # also declared when CNN weights are NOT shared (fn_str_2_filmId is
    # populated for both sharing modes) — confirm it is a no-op when
    # use_film == 0.
    self.declare_film_coefficients()

    self.save_module_outputs = False
def __init__(
    self,
    vocab,
    feature_dim=(1024, 14, 14),
    stem_num_layers=2,
    stem_batchnorm=False,
    stem_kernel_size=3,
    stem_stride=1,
    stem_padding=None,
    num_modules=4,
    max_program_module_arity=2,
    max_program_tree_depth=5,
    module_num_layers=1,
    module_dim=128,
    module_residual=True,
    module_batchnorm=False,
    module_batchnorm_affine=False,
    module_dropout=0,
    module_input_proj=1,
    module_kernel_size=3,
    classifier_proj_dim=512,
    classifier_downsample='maxpool2',
    classifier_fc_layers=(1024, ),
    classifier_batchnorm=False,
    classifier_dropout=0,
    condition_method='bn-film',
    condition_pattern=[],  # NOTE(review): mutable default; never mutated when empty, but prefer None — confirm callers
    use_gamma=True,
    use_beta=True,
    use_coords=1,
    debug_every=float('inf'),
    print_verbose_every=float('inf'),
    verbose=True,
):
    """Tree-structured FiLMed network: builds one module per
    (tree depth, module arity) pair — ResidualBlock for unary slots,
    ConcatBlock for higher arity — plus a root module '0', a conv stem,
    and an answer classifier.

    NOTE(review): requires CUDA — default FiLM weight/bias tensors are
    created as `torch.cuda.FloatTensor` unconditionally.
    """
    super(TFiLMedNet, self).__init__()

    num_answers = len(vocab['answer_idx_to_token'])

    # Bookkeeping for optional per-stage timing (used by the forward pass).
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.num_modules = num_modules
    self.max_program_module_arity = max_program_module_arity
    self.max_program_tree_depth = max_program_tree_depth
    self.module_num_layers = module_num_layers
    self.module_batchnorm = module_batchnorm
    self.module_dim = module_dim
    self.condition_method = condition_method
    self.use_gamma = use_gamma
    self.use_beta = use_beta
    self.use_coords_freq = use_coords
    self.debug_every = debug_every
    self.print_verbose_every = print_verbose_every

    # Initialize helper variables
    # Coordinate channels are only fed to the stem when it preserves
    # resolution (stride 1) and coords are enabled.
    self.stem_use_coords = (stem_stride == 1) and (self.use_coords_freq > 0)
    self.condition_pattern = condition_pattern
    if len(condition_pattern) == 0:
        # No pattern given: condition every layer of every (depth, arity)
        # slot, except under 'concat' conditioning.
        self.condition_pattern = []
        for i in range(self.max_program_tree_depth):
            idepth = []
            for j in range(self.max_program_module_arity):
                # NOTE(review): outer `*` repeats references to the same
                # inner list across layers — benign if read-only; confirm.
                ijarity = [[self.condition_method != 'concat'] * 2
                           ] * self.module_num_layers
                idepth.append(ijarity)
            self.condition_pattern.append(idepth)
    else:
        # Pattern given: coerce each entry to a boolean flag in place.
        for i in range(self.max_program_tree_depth):
            for j in range(self.max_program_module_arity):
                self.condition_pattern[i][j] = [
                    k > 0 for k in self.condition_pattern[i][j]
                ]
    self.extra_channel_freq = self.use_coords_freq
    #self.block = FiLMedResBlock
    self.num_cond_maps = 2 * self.module_dim if self.condition_method == 'concat' else 0
    self.fwd_count = 0
    self.num_extra_channels = 2 if self.use_coords_freq > 0 else 0
    if self.debug_every <= -1:
        self.print_verbose_every = 1
    module_H = feature_dim[1] // (stem_stride**stem_num_layers
                                  )  # Rough calc: work for main cases
    module_W = feature_dim[2] // (stem_stride**stem_num_layers
                                  )  # Rough calc: work for main cases
    self.coords = coord_map((module_H, module_W))
    # Default (identity-like) FiLM parameters used when no conditioning is
    # supplied; see CUDA note in the docstring.
    self.default_weight = Variable(torch.ones(1, 1, self.module_dim)).type(
        torch.cuda.FloatTensor)
    self.default_bias = Variable(torch.zeros(1, 1, self.module_dim)).type(
        torch.cuda.FloatTensor)

    # Initialize stem
    stem_feature_dim = feature_dim[
        0] + self.stem_use_coords * self.num_extra_channels
    self.stem = build_stem(stem_feature_dim,
                           module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size,
                           stride=stem_stride,
                           padding=stem_padding)

    # Initialize FiLMed network body
    self.function_modules = {}  # maps '0' / 'depth-arity' keys -> module
    self.vocab = vocab
    #for fn_num in range(self.num_modules):
    # Root module, registered under key '0'.
    mod = ResidualBlock(module_dim,
                        with_residual=module_residual,
                        with_batchnorm=module_batchnorm)
    self.add_module('0', mod)
    self.function_modules['0'] = mod

    # One module per (depth, arity) slot, keyed 'depth-arity' (1-based).
    for dep in range(self.max_program_tree_depth):
        for art in range(self.max_program_module_arity):
            with_cond = self.condition_pattern[dep][art]
            if art == 0:
                mod = ResidualBlock(module_dim,
                                    with_residual=module_residual,
                                    with_batchnorm=module_batchnorm)
            else:
                mod = ConcatBlock(art + 1,
                                  module_dim,
                                  with_residual=module_residual,
                                  with_batchnorm=module_batchnorm)
            ikey = str(dep + 1) + '-' + str(art + 1)
            self.add_module(ikey, mod)
            self.function_modules[ikey] = mod

    # Initialize output classifier
    self.classifier = build_classifier(module_dim + self.num_extra_channels,
                                       module_H,
                                       module_W,
                                       num_answers,
                                       classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    init_modules(self.modules())