def create_module():
    """Instantiate the neuro-symbolic module for the current function token.

    Reads `num_inputs`, `use_film`, `use_simple_block`, `shared_block`,
    `fn_idx` and the `module_*`/`kl_loss` settings from the enclosing scope.

    Returns:
        An nn.Module implementing the function, chosen by arity and the
        FiLM/simple-block configuration flags.

    Raises:
        Exception: if the token takes more than two inputs (unsupported).
    """
    if num_inputs > 2:
        raise Exception('Not implemented!')
    if use_film == 1:
        # One FiLMed core shared across all modules; only the per-function
        # coefficients (selected by fn_idx) differ.
        return FiLMModule(shared_block, fn_idx)
    if use_film == 2:
        # Each function token gets its own FiLMed core block.
        core = SharedFiLMedModule(module_dim, module_W,
                                  kernel_size=module_kernel_size,
                                  with_residual=module_residual)
        return FiLMModule(core, fn_idx)
    if use_simple_block:
        # Brutally simple concatenation block: 2 layers, no residual
        # connection.
        return SimpleConcatBlock(module_dim, kernel_size=module_kernel_size)
    # Plain EE-style modules: residual block for arity 0/1, concat block
    # for arity 2; both may share weights via shared_block.
    block_cls = ResidualBlock if num_inputs in (0, 1) else ConcatBlock
    return block_cls(module_dim, kernel_size=module_kernel_size,
                     with_residual=module_residual,
                     with_batchnorm=module_batchnorm,
                     shared_block=shared_block,
                     post_linear=kl_loss)
def build_cnn(feat_dim=(1024, 14, 14), res_block_dim=128, num_res_blocks=0,
              proj_dim=512, pooling='maxpool2'):
    """Assemble a small CNN head over pre-extracted features.

    Args:
        feat_dim: (C, H, W) of the incoming feature map.
        res_block_dim: channel width used when residual blocks are requested.
        num_res_blocks: number of ResidualBlocks to insert (0 disables the
            3x3 projection + residual stack entirely).
        proj_dim: output channels of the final 1x1 projection (0 disables it).
        pooling: 'maxpool2' appends a stride-2 max pool and halves H and W;
            any other value adds no pooling layer.

    Returns:
        (nn.Sequential, (C, H, W)) — the network and its output shape.
    """
    channels, height, width = feat_dim
    modules = []
    if num_res_blocks > 0:
        # Project to the residual width first, then stack the blocks.
        modules.extend([
            nn.Conv2d(channels, res_block_dim, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        ])
        channels = res_block_dim
        modules.extend(ResidualBlock(channels)
                       for _ in range(num_res_blocks))
    if proj_dim > 0:
        # 1x1 channel projection.
        modules.extend([
            nn.Conv2d(channels, proj_dim, kernel_size=1, padding=0),
            nn.ReLU(inplace=True),
        ])
        channels = proj_dim
    if pooling == 'maxpool2':
        modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
        height, width = height // 2, width // 2
    return nn.Sequential(*modules), (channels, height, width)
def __init__(self, vocab, feature_dim=(1024, 14, 14), stem_num_layers=2,
             stem_batchnorm=False, module_dim=128, module_residual=True,
             module_batchnorm=False, classifier_proj_dim=512,
             classifier_downsample='maxpool2', classifier_fc_layers=(1024, ),
             classifier_batchnorm=False, classifier_dropout=0, verbose=True):
    """Build a ModuleNet: stem CNN, per-function-token modules, classifier.

    Args:
        vocab: dict with 'answer_idx_to_token' (sizes the classifier) and
            'program_token_to_idx' (one module is created per token).
        feature_dim: (C, H, W) of the input feature maps; H, W are passed
            unchanged to the classifier (the stem does not downsample here).
        stem_*: stem CNN configuration forwarded to build_stem.
        module_*: configuration shared by all Residual/Concat modules.
        classifier_*: configuration forwarded to build_classifier.
        verbose: if True, print the stem and classifier structures.
    """
    super(ModuleNet, self).__init__()
    self.stem = build_stem(feature_dim[0], module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    num_answers = len(vocab['answer_idx_to_token'])
    # Module grid keeps the spatial size of the input features.
    module_H, module_W = feature_dim[1], feature_dim[2]
    self.classifier = build_classifier(module_dim, module_H, module_W,
                                       num_answers, classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    # Timing buckets; only populated when self.timing is switched on.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    # fn_str -> nn.Module and fn_str -> arity lookups.
    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    for fn_str in vocab['program_token_to_idx']:
        num_inputs = vr.programs.get_num_inputs(fn_str)
        self.function_modules_num_inputs[fn_str] = num_inputs
        # 'scene' and unary functions get a ResidualBlock; binary functions
        # get a ConcatBlock.
        # NOTE(review): a 0-arity token other than 'scene' (or arity > 2)
        # matches neither branch, so `mod` would be stale or unbound here —
        # presumably the vocab only contains 'scene'/unary/binary tokens;
        # TODO confirm.
        if fn_str == 'scene' or num_inputs == 1:
            mod = ResidualBlock(module_dim,
                                with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            mod = ConcatBlock(module_dim,
                              with_residual=module_residual,
                              with_batchnorm=module_batchnorm)
        # Register under the token name so parameters are tracked.
        self.add_module(fn_str, mod)
        self.function_modules[fn_str] = mod
    self.save_module_outputs = False
def __init__(self, vocab, feature_dim, stem_num_layers, stem_batchnorm,
             stem_subsample_layers, stem_kernel_size, stem_stride,
             stem_padding, stem_dim, module_dim, module_kernel_size,
             module_input_proj, forward_func, use_color,
             module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ), classifier_batchnorm=False,
             classifier_dropout=0, use_film=False, verbose=True):
    """Build the network: stem, unary/binary function modules, classifier.

    When `use_film` is truthy a single shared FiLMed unary block and a
    shared FiLMed binary block serve every function token; otherwise one
    plain Residual/Concat block is created per token.

    Args:
        vocab: needs 'answer_idx_to_token', 'program_token_to_idx' and
            'program_token_arity'.
        feature_dim: (C, H, W) of input features; module H/W are measured
            by pushing a dummy tensor through the stem.
        forward_func: key into FUNC_DICT selecting the forward strategy
            (e.g. 'tree' enables binary ConcatBlocks).
        use_color: stored flag; consumed elsewhere (semantics not visible
            here).
    """
    super().__init__()
    self.module_dim = module_dim
    self.func = FUNC_DICT[forward_func]
    self.use_color = use_color
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim,
                           num_layers=stem_num_layers,
                           subsample_layers=stem_subsample_layers,
                           kernel_size=stem_kernel_size,
                           padding=stem_padding,
                           with_batchnorm=stem_batchnorm)
    # Probe the stem with a zero batch to learn the post-stem spatial size
    # (the stem may subsample, so H/W cannot be taken from feature_dim).
    tmp = self.stem(
        Variable(
            torch.zeros(
                [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    # Coordinate feature map, with a leading batch dimension.
    self.coords = coord_map((module_H, module_W)).unsqueeze(0)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, module_H, module_W,
                                       num_answers, classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    self.unary_function_modules = {}
    self.binary_function_modules = {}
    self.vocab = vocab
    self.use_film = use_film
    if self.use_film:
        # One shared FiLMed block per arity; per-function behaviour comes
        # from the FiLM coefficients declared at the end of __init__.
        unary_mod = FiLMedResBlock(
            module_dim, with_residual=module_residual,
            with_intermediate_batchnorm=False, with_batchnorm=False,
            with_cond=[True, True],
            num_extra_channels=2,  # was 2 for original film
            extra_channel_freq=1,
            with_input_proj=module_input_proj, num_cond_maps=0,
            kernel_size=module_kernel_size, batchnorm_affine=False,
            num_layers=1, condition_method='bn-film',
            debug_every=float('inf'))
        binary_mod = ConcatFiLMedResBlock(
            2, module_dim, with_residual=module_residual,
            with_intermediate_batchnorm=False, with_batchnorm=False,
            with_cond=[True, True],
            num_extra_channels=2,  # was 2 for original film
            extra_channel_freq=1,
            with_input_proj=module_input_proj, num_cond_maps=0,
            kernel_size=module_kernel_size, batchnorm_affine=False,
            num_layers=1, condition_method='bn-film',
            debug_every=float('inf'))
        self.unary_function_modules['film'] = unary_mod
        self.binary_function_modules['film'] = binary_mod
        self.add_module('film_unary', unary_mod)
        self.add_module('film_binary', binary_mod)
    else:
        # One dedicated module per program token, keyed by arity.
        for fn_str in vocab['program_token_to_idx']:
            arity = self.vocab['program_token_arity'][fn_str]
            if arity == 2 and forward_func == 'tree':
                binary_mod = ConcatBlock(module_dim,
                                         kernel_size=module_kernel_size,
                                         with_residual=module_residual,
                                         with_batchnorm=module_batchnorm,
                                         use_simple=False)
                self.add_module(fn_str, binary_mod)
                self.binary_function_modules[fn_str] = binary_mod
            else:
                # Everything else (including binary tokens outside 'tree'
                # mode) is treated as unary.
                mod = ResidualBlock(module_dim,
                                    kernel_size=module_kernel_size,
                                    with_residual=module_residual,
                                    with_batchnorm=module_batchnorm)
                self.add_module(fn_str, mod)
                self.unary_function_modules[fn_str] = mod
    # Create the per-function FiLM gammas/betas (defined elsewhere).
    self.declare_film_coefficients()
def __init__(self, vocab, feature_dim=(1024, 14, 14), stem_use_resnet=False,
             stem_resnet_fixed=False, resnet_model_stage=3, stem_num_layers=2,
             stem_batchnorm=False, stem_kernel_size=3, stem_stride=1,
             stem_stride2_freq=0, stem_padding=None, module_dim=128,
             module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024,), classifier_batchnorm=False,
             classifier_dropout=0, verbose=True):
    """Build a ModuleNet with an optional ResNet stem.

    Args:
        vocab: needs 'answer_idx_to_token', 'program_token_to_idx' and
            'program_token_num_inputs' (token -> arity).
        feature_dim: (C, H, W) of the input features.
        stem_use_resnet / stem_resnet_fixed / resnet_model_stage: choose a
            (possibly frozen) ResNet stage as the stem instead of plain
            convolutions.
        stem_stride2_freq: every this-many stem layers use stride 2; used
            below to predict the post-stem spatial size analytically.
    """
    super(ModuleNet, self).__init__()
    self.stem = build_stem(stem_use_resnet, stem_resnet_fixed, feature_dim[0],
                           module_dim, resnet_model_stage=resnet_model_stage,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm,
                           kernel_size=stem_kernel_size, stride=stem_stride,
                           stride2_freq=stem_stride2_freq,
                           padding=stem_padding)
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    # Each stride-2 layer halves H and W; there are
    # stem_num_layers // stem_stride2_freq of them.
    if stem_stride2_freq > 0:
        module_H = feature_dim[1] // (2 ** (stem_num_layers //
                                            stem_stride2_freq))
        module_W = feature_dim[2] // (2 ** (stem_num_layers //
                                            stem_stride2_freq))
    else:
        module_H = feature_dim[1]
        module_W = feature_dim[2]
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, module_H, module_W,
                                       num_answers, classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    # Timing buckets; only populated when self.timing is switched on.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    self.function_modules = {}
    # Arity lookup comes straight from the vocab here (copied so later
    # mutation of the dict does not alias the vocab).
    self.function_modules_num_inputs = dict(vocab['program_token_num_inputs'])
    self.vocab = vocab
    # Token that plays the role of the 'scene' input; first 0-arity token
    # wins unless a literal 'scene' token appears later.
    self.scene = None
    for fn_str in vocab['program_token_to_idx']:
        # num_inputs = vr.programs.get_num_inputs(fn_str)
        # self.function_modules_num_inputs[fn_str] = num_inputs
        num_inputs = self.function_modules_num_inputs[fn_str]
        if num_inputs == 0 and self.scene is None:
            self.scene = fn_str
        elif fn_str == 'scene':
            self.scene = fn_str
        # if fn_str == 'scene' or num_inputs == 1:
        if num_inputs == 0 or num_inputs == 1:
            mod = ResidualBlock(module_dim,
                                with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs >= 2:
            # ConcatBlock here takes the arity as its first argument.
            mod = ConcatBlock(num_inputs, module_dim,
                              with_residual=module_residual,
                              with_batchnorm=module_batchnorm)
        self.add_module(fn_str, mod)
        self.function_modules[fn_str] = mod
    self.save_module_outputs = False
def __init__(self, vocab, feature_dim, use_film, use_simple_block,
             stem_num_layers, stem_batchnorm, stem_subsample_layers,
             stem_kernel_size, stem_stride, stem_padding, stem_dim,
             module_dim, module_pool, module_use_gammas, module_kernel_size,
             module_input_proj, module_residual=True, module_batchnorm=False,
             module_num_layers=1, mod_id_loss=False, kl_loss=False,
             learn_control=False, rnn_dim=None, classifier_proj_dim=512,
             classifier_downsample='maxpool2', classifier_fc_layers=(1024, ),
             classifier_batchnorm=False, classifier_dropout=0,
             discriminator_proj_dim=None, discriminator_downsample=None,
             discriminator_fc_layers=None, discriminator_dropout=None,
             verbose=True, type_anonymizer=False):
    """Build a ModuleNet variant with shared-FiLM modules and optional
    module-identity discriminator / KL / learned-control extras.

    Key flags:
        use_film: 1 -> one shared FiLMed core for all tokens;
            2 -> one FiLMed core per token; 0 -> plain Residual/Concat.
        use_simple_block: use SimpleConcatBlock for every token instead.
        mod_id_loss: also build a discriminator that classifies which
            module produced an output (sized by 'program_idx_to_token').
        kl_loss: threads a post-linear head through the blocks.
        learn_control: use a MAC-style controller instead of learned
            per-function embeddings.
        type_anonymizer: a single shared ResidualBlock backs every token.
    Discriminator_* options default to the corresponding classifier_*.
    """
    super(ModuleNet, self).__init__()
    if discriminator_proj_dim is None:
        discriminator_proj_dim = classifier_proj_dim
    if discriminator_downsample is None:
        discriminator_downsample = classifier_downsample
    if discriminator_fc_layers is None:
        discriminator_fc_layers = classifier_fc_layers
    if discriminator_dropout is None:
        discriminator_dropout = classifier_dropout
    self.module_dim = module_dim
    self.use_film = use_film
    self.use_simple_block = use_simple_block
    self.mod_id_loss = mod_id_loss
    self.kl_loss = kl_loss
    self.learn_control = learn_control
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim,
                           num_layers=stem_num_layers,
                           subsample_layers=stem_subsample_layers,
                           kernel_size=stem_kernel_size,
                           padding=stem_padding,
                           with_batchnorm=stem_batchnorm)
    # Probe the stem with a zero batch to measure the post-stem H/W.
    tmp = self.stem(
        Variable(
            torch.zeros(
                [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    self.coords = coord_map((module_H, module_W))
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    classifier_kwargs = dict(module_C=module_dim,
                             module_H=module_H,
                             module_W=module_W,
                             num_answers=len(vocab['answer_idx_to_token']),
                             fc_dims=classifier_fc_layers,
                             proj_dim=classifier_proj_dim,
                             downsample=classifier_downsample,
                             with_batchnorm=classifier_batchnorm,
                             dropout=classifier_dropout)
    # The discriminator predicts the program token, hence its output size.
    discriminator_kwargs = dict(module_C=module_dim,
                                module_H=module_H,
                                module_W=module_W,
                                num_answers=len(
                                    vocab['program_idx_to_token']),
                                fc_dims=discriminator_fc_layers,
                                proj_dim=discriminator_proj_dim,
                                downsample=discriminator_downsample,
                                with_batchnorm=False,
                                dropout=discriminator_dropout)
    if self.use_film:
        # FiLMed modules pool to 1x1 before the classifier/discriminator.
        classifier_kwargs['module_H'] = 1
        classifier_kwargs['module_W'] = 1
        discriminator_kwargs['module_H'] = 1
        discriminator_kwargs['module_W'] = 1
    self.classifier = build_classifier(**classifier_kwargs)
    if self.mod_id_loss:
        self.module_identifier = build_classifier(**discriminator_kwargs)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    shared_block = None
    if type_anonymizer:
        shared_block = ResidualBlock(module_dim,
                                     kernel_size=module_kernel_size,
                                     with_residual=module_residual,
                                     with_batchnorm=module_batchnorm)
    elif use_film == 1:
        assert module_W == module_H
        shared_block = SharedFiLMedModule(
            module_dim, kernel_size=module_kernel_size,
            num_layers=module_num_layers, with_residual=module_residual,
            pool=module_pool, use_gammas=module_use_gammas,
            post_linear=kl_loss, learn_embeddings=not learn_control)
    if shared_block:
        self.shared_block = shared_block
        self.add_module('shared', shared_block)
    for fn_str, fn_idx in vocab['program_token_to_idx'].items():
        num_inputs = vocab['program_token_arity'][fn_str]
        self.function_modules_num_inputs[fn_str] = num_inputs

        # Closure over the loop variables; picks the module type for the
        # current token.
        def create_module():
            if num_inputs > 2:
                raise Exception('Not implemented!')
            if use_film == 1:
                # Shared core; per-token behaviour via FiLM coefficients.
                return FiLMModule(shared_block, fn_idx)
            if use_film == 2:
                # Separate FiLMed core per token.
                separate_core_block = SharedFiLMedModule(
                    module_dim, module_W, kernel_size=module_kernel_size,
                    with_residual=module_residual)
                return FiLMModule(separate_core_block, fn_idx)
            if use_simple_block:
                # brutally simple concatenation block
                # with 2 layers, no residual connection
                return SimpleConcatBlock(module_dim,
                                         kernel_size=module_kernel_size)
            if num_inputs in [0, 1]:
                return ResidualBlock(module_dim,
                                     kernel_size=module_kernel_size,
                                     with_residual=module_residual,
                                     with_batchnorm=module_batchnorm,
                                     shared_block=shared_block,
                                     post_linear=kl_loss)
            else:
                return ConcatBlock(module_dim,
                                   kernel_size=module_kernel_size,
                                   with_residual=module_residual,
                                   with_batchnorm=module_batchnorm,
                                   shared_block=shared_block,
                                   post_linear=kl_loss)

        mod = create_module()
        if mod is not None:
            self.add_module(fn_str, mod)
            self.function_modules[fn_str] = mod
    self.save_module_outputs = False
    self.noise_enabled = True
    if learn_control:
        # NOTE(review): 30 is presumably the max program/question length
        # expected by MACControl — confirm against its definition.
        self.controller = MACControl(30, rnn_dim, module_dim)
def __init__(self, vocab, feature_dim, use_film, use_simple_block,
             sharing_patterns, stem_num_layers, stem_batchnorm,
             stem_subsample_layers, stem_kernel_size, stem_stride,
             stem_padding, stem_dim, module_dim, module_kernel_size,
             module_input_proj, module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024, ), classifier_batchnorm=False,
             classifier_dropout=0, verbose=True):
    """Build a ModuleNet variant with configurable FiLM weight/coefficient
    sharing across the per-function modules."""
    super(ModuleNet, self).__init__()
    self.module_dim = module_dim
    # should be 0 or 1 to indicate the use of film block or not
    # (0 would bring you back to the original EE model)
    self.use_film = use_film
    # should be 0 or 1 to indicate if we are using ResNets or a simple
    # 3x3 conv followed by ReLU
    self.use_simple_block = use_simple_block
    # this should be a list of two elements (either 0 or 1). It's only
    # active if self.use_film == 1.
    # The first element of 1 indicates the sharing of CNN weights in the
    # film blocks, 0 otherwise.
    # The second element of 1 indicates the sharing of film coefficients
    # in the film blocks, 0 otherwise.
    # So [1, 0] would be sharing the CNN weights while having different
    # film coefficients for different modules in the program.
    self.sharing_patterns = sharing_patterns
    self.stem = build_stem(feature_dim[0], stem_dim, module_dim,
                           num_layers=stem_num_layers,
                           subsample_layers=stem_subsample_layers,
                           kernel_size=stem_kernel_size,
                           padding=stem_padding,
                           with_batchnorm=stem_batchnorm)
    # Probe the stem with a zero batch to measure the post-stem H/W.
    tmp = self.stem(
        Variable(
            torch.zeros(
                [1, feature_dim[0], feature_dim[1], feature_dim[2]])))
    module_H = tmp.size(2)
    module_W = tmp.size(3)
    self.coords = coord_map((module_H, module_W))
    if verbose:
        print('Here is my stem:')
        print(self.stem)
    num_answers = len(vocab['answer_idx_to_token'])
    self.classifier = build_classifier(module_dim, module_H, module_W,
                                       num_answers, classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)
    # Timing buckets; only populated when self.timing is switched on.
    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False
    self.function_modules = {}
    self.function_modules_num_inputs = {}
    # fn_str -> index of the FiLM coefficient set it uses.
    self.fn_str_2_filmId = {}
    self.vocab = vocab
    for fn_str in vocab['program_token_to_idx']:
        num_inputs = vocab['program_token_arity'][fn_str]
        self.function_modules_num_inputs[fn_str] = num_inputs
        if self.use_film:
            if self.sharing_patterns[1] == 1:
                # Shared coefficients: every token maps to set 0.
                self.fn_str_2_filmId[fn_str] = 0
            else:
                # Separate coefficients: assign the next free index.
                self.fn_str_2_filmId[fn_str] = len(self.fn_str_2_filmId)
        if fn_str == 'scene' or num_inputs == 1:
            if self.use_film:
                if self.sharing_patterns[0] == 1:
                    # CNN weights shared: the single 'shared_film' module
                    # created below serves this token.
                    mod = None
                else:
                    mod = FiLMedResBlock(
                        module_dim, with_residual=module_residual,
                        with_intermediate_batchnorm=False,
                        with_batchnorm=False,
                        with_cond=[True, True],
                        num_extra_channels=2,  # was 2 for original film
                        extra_channel_freq=1,
                        with_input_proj=module_input_proj, num_cond_maps=0,
                        kernel_size=module_kernel_size,
                        batchnorm_affine=False, num_layers=1,
                        condition_method='bn-film',
                        debug_every=float('inf'))
            else:
                if self.use_simple_block:
                    mod = SimpleVisualBlock(module_dim,
                                            kernel_size=module_kernel_size)
                else:
                    mod = ResidualBlock(module_dim,
                                        kernel_size=module_kernel_size,
                                        with_residual=module_residual,
                                        with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            if self.use_film:
                if self.sharing_patterns[0] == 1:
                    mod = None
                else:
                    mod = ConcatFiLMedResBlock(
                        2, module_dim, with_residual=module_residual,
                        with_intermediate_batchnorm=False,
                        with_batchnorm=False,
                        with_cond=[True, True],
                        num_extra_channels=2,  # was 2 for original film
                        extra_channel_freq=1,
                        with_input_proj=module_input_proj, num_cond_maps=0,
                        kernel_size=module_kernel_size,
                        batchnorm_affine=False, num_layers=1,
                        condition_method='bn-film',
                        debug_every=float('inf'))
            else:
                mod = ConcatBlock(module_dim,
                                  kernel_size=module_kernel_size,
                                  with_residual=module_residual,
                                  with_batchnorm=module_batchnorm)
        else:
            raise Exception('Not implemented!')
        if mod is not None:
            self.add_module(fn_str, mod)
            self.function_modules[fn_str] = mod
    if self.use_film and self.sharing_patterns[0] == 1:
        # Single shared FiLMed CNN serving every token; it accepts up to
        # two inputs so it covers both unary and binary tokens.
        mod = ConcatFiLMedResBlock(
            2, module_dim, with_residual=module_residual,
            with_intermediate_batchnorm=False, with_batchnorm=False,
            with_cond=[True, True],
            num_extra_channels=2,  # was 2 for original film
            extra_channel_freq=1,
            with_input_proj=module_input_proj, num_cond_maps=0,
            kernel_size=module_kernel_size, batchnorm_affine=False,
            num_layers=1, condition_method='bn-film',
            debug_every=float('inf'))
        self.add_module('shared_film', mod)
        self.function_modules['shared_film'] = mod
    # Create the FiLM gammas/betas for the ids collected above.
    # NOTE(review): placed at method level (not inside the preceding `if`)
    # because coefficients are also needed when CNN weights are NOT shared
    # (sharing_patterns[0] == 0) — confirm against the original layout.
    self.declare_film_coefficients()
    self.save_module_outputs = False