import tensorflow as tf


def build_cnn(feat_dim=(1024, 14, 14), res_block_dim=128, num_res_blocks=0,
              proj_dim=512, pooling='maxpool2'):
    C, H, W = feat_dim
    layers = []
    if num_res_blocks > 0:
        # Keras Conv2D takes (filters, kernel_size) and infers input channels.
        layers.append(tf.keras.layers.Conv2D(res_block_dim, kernel_size=(3, 3),
                                             padding='same'))
        layers.append(tf.keras.layers.ReLU())
        C = res_block_dim
        for _ in range(num_res_blocks):
            layers.append(ResidualBlock(C))
    if proj_dim > 0:
        layers.append(tf.keras.layers.Conv2D(proj_dim, kernel_size=(1, 1),
                                             padding='valid'))
        layers.append(tf.keras.layers.ReLU())
        C = proj_dim
    if pooling == 'maxpool2':
        layers.append(tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=2))
        H, W = H // 2, W // 2
    model = tf.keras.Sequential()
    for layer in layers:
        model.add(layer)
    return model, (C, H, W)
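# Usage sketch (illustrative, not part of the original file): the default
# config projects to 512 channels and halves the spatial size. feat_dim is
# recorded channels-first, while Keras consumes channels-last (NHWC) tensors.
cnn, (C, H, W) = build_cnn()
y = cnn(tf.random.normal((1, 14, 14, 1024)))  # NHWC dummy features
print(y.shape, (C, H, W))  # (1, 7, 7, 512) (512, 7, 7)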
def __init__(self, dim, with_residual=True, with_batchnorm=True):
    super(ConcatBlock, self).__init__()
    self.proj = tf.keras.layers.Conv2D(dim, kernel_size=(1, 1), padding='same')
    self.res_block = ResidualBlock(dim, with_residual=with_residual,
                                   with_batchnorm=with_batchnorm)
def __init__(self, vocab, feature_dim=(1024, 14, 14),
             stem_num_layers=2, stem_batchnorm=False,
             module_dim=128, module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024,), classifier_batchnorm=False,
             classifier_dropout=0, verbose=True):
    super(ModuleNet, self).__init__()

    self.stem = build_stem(feature_dim[0], module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm)
    if verbose:
        print('Here is my stem:')
        print(self.stem)

    num_answers = len(vocab['answer_idx_to_token'])
    module_H, module_W = feature_dim[1], feature_dim[2]
    self.classifier = build_classifier(module_dim, module_H, module_W,
                                       num_answers,
                                       classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)

    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    for fn_str in vocab['program_token_to_idx']:
        num_inputs = iep.programs.get_num_inputs(fn_str)
        self.function_modules_num_inputs[fn_str] = num_inputs
        # 'scene' and unary functions get a ResidualBlock; binary functions
        # get a ConcatBlock that fuses two feature maps.
        if fn_str == 'scene' or num_inputs == 1:
            mod = ResidualBlock(module_dim,
                                with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            mod = ConcatBlock(module_dim,
                              with_residual=module_residual,
                              with_batchnorm=module_batchnorm)
        self.add_module(fn_str, mod)
        self.function_modules[fn_str] = mod

    self.save_module_outputs = False
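# Hypothetical dispatch sketch (the forward pass is not part of this snippet):
# executing a program walks its function tokens, looks up each token's module
# in function_modules, and feeds it one or two feature maps from a stack.
def _run_program_sketch(net, feats, program_tokens):
    stack = []
    for fn_str in program_tokens:
        mod = net.function_modules[fn_str]
        num_inputs = net.function_modules_num_inputs[fn_str]
        if fn_str == 'scene':
            stack.append(mod(feats))        # inject stem features
        elif num_inputs == 1:
            stack.append(mod(stack.pop()))
        else:                               # num_inputs == 2
            stack.append(mod(stack.pop(), stack.pop()))
    return stack.pop()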
import torch.nn as nn


def build_classifier(module_C, module_H, module_W, fc_dims=(),
                     proj_dim=None, downsample='maxpool2',
                     with_batchnorm=True, dropout=0):
    # fc_dims, proj_dim, downsample, and dropout are accepted for interface
    # compatibility but unused in this dense-prediction variant.
    res_block = ResidualBlock(module_C, with_residual=True,
                              with_batchnorm=False)
    layers = [res_block]

    layers.append(nn.Conv2d(module_C, module_C, kernel_size=1))
    if with_batchnorm:
        # Each BatchNorm2d width matches the preceding layer's output channels.
        layers.append(nn.BatchNorm2d(module_C))
    layers.append(nn.ReLU(inplace=True))

    layers.append(nn.Upsample(size=[320, 320], mode='bilinear'))
    if with_batchnorm:
        layers.append(nn.BatchNorm2d(module_C))
    layers.append(nn.ReLU(inplace=True))

    layers.append(nn.Conv2d(module_C, module_C, kernel_size=1))
    if with_batchnorm:
        layers.append(nn.BatchNorm2d(module_C))
    layers.append(nn.ReLU(inplace=True))

    layers.append(nn.Conv2d(module_C, module_C // 4, kernel_size=1))
    if with_batchnorm:
        layers.append(nn.BatchNorm2d(module_C // 4))
    layers.append(nn.ReLU(inplace=True))

    layers.append(nn.Conv2d(module_C // 4, 4, kernel_size=1))
    if with_batchnorm:
        layers.append(nn.BatchNorm2d(4))
    layers.append(nn.ReLU(inplace=True))

    layers.append(nn.Conv2d(4, 2, kernel_size=1))
    if with_batchnorm:
        layers.append(nn.BatchNorm2d(2))
    return nn.Sequential(*layers)
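# Shape-check sketch (assumes the ResidualBlock defined elsewhere in this repo
# is in scope): with 14x14 module features the classifier bilinearly upsamples
# to 320x320 and squeezes channels down to a 2-channel dense map.
import torch

clf = build_classifier(module_C=128, module_H=14, module_W=14,
                       with_batchnorm=False)
print(clf(torch.randn(2, 128, 14, 14)).shape)  # torch.Size([2, 2, 320, 320])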
import torch.nn as nn


def build_cnn(feat_dim=(1024, 14, 14), res_block_dim=128, num_res_blocks=0,
              proj_dim=512, pooling='maxpool2'):
    C, H, W = feat_dim
    layers = []
    if num_res_blocks > 0:
        layers.append(nn.Conv2d(C, res_block_dim, kernel_size=3, padding=1))
        layers.append(nn.ReLU(inplace=True))
        C = res_block_dim
        for _ in range(num_res_blocks):
            layers.append(ResidualBlock(C))
    if proj_dim > 0:
        layers.append(nn.Conv2d(C, proj_dim, kernel_size=1, padding=0))
        layers.append(nn.ReLU(inplace=True))
        C = proj_dim
    if pooling == 'maxpool2':
        layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        H, W = H // 2, W // 2
    return nn.Sequential(*layers), (C, H, W)
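# Usage sketch for the PyTorch variant: channels-first input, default config
# (no residual blocks, 1x1 projection to 512 channels, then 2x2 max-pooling).
import torch

cnn, (C, H, W) = build_cnn()
y = cnn(torch.randn(1, 1024, 14, 14))
print(y.shape, (C, H, W))  # torch.Size([1, 512, 7, 7]) (512, 7, 7)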
def __init__(self, vocab, feature_dim=(1024, 14, 14),
             stem_num_layers=2, stem_batchnorm=False,
             module_dim=128, text_dim=1,
             module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024,), classifier_batchnorm=False,
             classifier_dropout=0, verbose=True):
    super(ModuleNet, self).__init__()

    self.stem = build_stem(feature_dim[0], module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm)
    if verbose:
        print('Here is my stem:')
        print(self.stem)

    # Character-level text encoders: a bidirectional LSTM (2 * 98 = 196
    # features per step) and a 3-layer Transformer encoder over 28-dim
    # character embeddings, followed by a 28 -> 196 projection.
    self.char_lstm = nn.LSTM(input_size=28, hidden_size=98,
                             bidirectional=True, batch_first=True)
    encoder_layer = nn.TransformerEncoderLayer(d_model=28, nhead=7)
    self.char_transformer = nn.TransformerEncoder(encoder_layer=encoder_layer,
                                                  num_layers=3)
    self.char_linear = nn.Linear(28, 196)

    num_answers = len(vocab['answer_idx_to_token'])
    module_H, module_W = feature_dim[1], feature_dim[2]
    # Every module carries `text_dim` extra channels for the text feature.
    self.classifier = build_classifier(module_dim + text_dim,
                                       module_H, module_W,
                                       num_answers,
                                       classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)

    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    self.module_list = []
    for idx, fn_str in enumerate(vocab['program_token_to_idx']):
        num_inputs = iep.programs.get_num_inputs(fn_str)
        self.function_modules_num_inputs[fn_str] = num_inputs
        if fn_str == 'scene' or num_inputs == 1:
            mod = ResidualBlock(module_dim + text_dim,
                                with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            mod = ConcatBlock(module_dim + text_dim,
                              with_residual=module_residual,
                              with_batchnorm=module_batchnorm)
        self.add_module(fn_str, mod)
        self.module_list.append(mod)
        # Unlike the variant above, this one stores module *indices* and keeps
        # the modules themselves in an nn.ModuleList.
        self.function_modules[fn_str] = idx
    self.module_list = nn.ModuleList(self.module_list)

    self.save_module_outputs = False
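# Shape walk-through for the character encoders (how their outputs are fused
# with the visual features is not shown in this snippet): 2 * 98 = 196
# bidirectional-LSTM features per step reshape to one extra 14 x 14 channel.
import torch
import torch.nn as nn

chars = torch.randn(4, 10, 28)  # (batch, seq_len, char_dim)
lstm = nn.LSTM(input_size=28, hidden_size=98, bidirectional=True,
               batch_first=True)
out, _ = lstm(chars)
print(out.shape)                            # torch.Size([4, 10, 196])
print(out[:, -1].view(4, 1, 14, 14).shape)  # torch.Size([4, 1, 14, 14])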
def __init__(self, dim, with_residual=True, with_batchnorm=True):
    super(ConcatBlock, self).__init__()
    self.proj = nn.Conv2d(2 * dim, dim, kernel_size=1, padding=0)
    self.res_block = ResidualBlock(dim, with_residual=with_residual,
                                   with_batchnorm=with_batchnorm)
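# The forward pass is not shown above. A sketch matching the upstream
# clevr-iep ConcatBlock (assumes `import torch` and
# `import torch.nn.functional as F`): concatenate along channels, project
# 2*dim -> dim with the 1x1 conv, then apply the residual block.
def forward(self, x, y):
    out = torch.cat([x, y], 1)    # (N, 2*dim, H, W)
    out = F.relu(self.proj(out))  # (N, dim, H, W)
    return self.res_block(out)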
def __init__(self, vocab, feature_dim=(1024, 14, 14),
             stem_num_layers=2, stem_batchnorm=False,
             module_dim=128, module_residual=True, module_batchnorm=False,
             classifier_proj_dim=512, classifier_downsample='maxpool2',
             classifier_fc_layers=(1024,), classifier_batchnorm=False,
             classifier_dropout=0, verbose=True):
    super(ModuleNet, self).__init__()

    self.stem = build_stem(feature_dim[0], module_dim,
                           num_layers=stem_num_layers,
                           with_batchnorm=stem_batchnorm)
    if verbose:
        print('Here is my stem:')
        print(self.stem)

    self.glove = torchtext.vocab.GloVe(name="6B", dim=50)  # embedding size = 50

    module_H, module_W = feature_dim[1], feature_dim[2]
    self.classifier = build_classifier(module_dim, module_H, module_W,
                                       classifier_fc_layers,
                                       classifier_proj_dim,
                                       classifier_downsample,
                                       with_batchnorm=classifier_batchnorm,
                                       dropout=classifier_dropout)
    if verbose:
        print('Here is my classifier:')
        print(self.classifier)

    self.stem_times = []
    self.module_times = []
    self.classifier_times = []
    self.timing = False

    self.function_modules = {}
    self.function_modules_num_inputs = {}
    self.vocab = vocab
    print("vocab['program_token_to_idx']={}".format(
        vocab['program_token_to_idx']))
    for fn_str in vocab['program_token_to_idx']:
        fn_str = str(fn_str)
        # FIXME: parameterized tokens such as 'filter_color[red]' share one
        # module per function family, keyed by the name before '['.
        fn_name = fn_str.split("[")[0]
        if fn_name in self.function_modules:
            continue
        num_inputs = iep.programs.get_num_inputs(fn_str)
        self.function_modules_num_inputs[fn_name] = num_inputs
        if fn_str == 'scene':
            mod = ResidualBlock(module_dim,
                                with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs == 1 and "[" not in fn_str:
            # Unparameterized unary function.
            mod = ResidualBlock(module_dim,
                                with_residual=module_residual,
                                with_batchnorm=module_batchnorm)
        elif num_inputs == 1 and "[" in fn_str:
            # Parameterized unary function, handled by the language-attention
            # block.
            mod = ResidualBlock_LangAttention(module_dim,
                                              with_residual=module_residual,
                                              with_batchnorm=module_batchnorm)
        elif num_inputs == 2:
            mod = ConcatBlock(module_dim,
                              with_residual=module_residual,
                              with_batchnorm=module_batchnorm)
        self.add_module(fn_name, mod)
        self.function_modules[fn_name] = mod

    self.save_module_outputs = False
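# Hypothetical sketch (names and usage assumed, not from this snippet) of how
# the GloVe table could supply a word vector for the bracketed argument of a
# parameterized token. GloVe("6B", dim=50) downloads vectors on first use.
import torchtext

glove = torchtext.vocab.GloVe(name="6B", dim=50)
fn_str = "filter_color[red]"
arg = fn_str.split("[")[1].rstrip("]")  # -> 'red'
vec = glove[arg]                        # 50-dim embedding for 'red'
print(vec.shape)                        # torch.Size([50])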