def silent_net():
    """Build a small net: one two-top DummyData layer, each top fed to a Silence layer.

    Returns:
        The result of ``NetSpec.to_proto()`` for that net.
    """
    spec = caffe.NetSpec()
    blob_shapes = [dict(dim=[3]), dict(dim=[4, 2])]
    spec.data, spec.data2 = L.DummyData(shape=blob_shapes, ntop=2)
    # Each Silence layer is declared with ntop=0, i.e. it has no top of its own.
    for blob_name in ('data', 'data2'):
        silencer = L.Silence(getattr(spec, blob_name), ntop=0)
        setattr(spec, 'silence_' + blob_name, silencer)
    return spec.to_proto()
def test_type_error(self):
    """Test that a TypeError is raised when a Function input isn't a Top."""
    data = L.DummyData(ntop=2)  # data is a 2-tuple of Tops
    r = r"^Silence input 0 is not a Top \(type is <(type|class) 'tuple'>\)$"
    # `assertRaisesRegexp` is the Python 2 spelling; it was deprecated in
    # Python 3.2 and removed in 3.12.  Use the modern name when the TestCase
    # provides it and fall back to the old one otherwise, so the test runs on
    # both interpreter lines (the regex above already caters for both).
    assert_raises_regex = getattr(self, 'assertRaisesRegex', None)
    if assert_raises_regex is None:
        assert_raises_regex = self.assertRaisesRegexp
    with assert_raises_regex(TypeError, r):
        L.Silence(data, ntop=0)  # should raise: data is a tuple, not a Top
    L.Silence(*data, ntop=0)  # shouldn't raise: each elt of data is a Top
def silence(self, bottom):
    """Register a zero-top Silence layer over `bottom` on ``self.n``.

    `bottom` may be a single bottom or a ``list`` of bottoms; a list is
    expanded into separate positional inputs.  The layer is stored under
    ``'silence_cell_<count>'`` and ``self.silence_count`` is then incremented.
    """
    bottoms = bottom if isinstance(bottom, list) else [bottom]
    cell_key = 'silence_cell_' + str(self.silence_count)
    self.n.tops[cell_key] = L.Silence(*bottoms, ntop=0)
    self.silence_count += 1
def start(args):
    """Create a NetSpec holding the MemoryData inputs and the Silence layers for them.

    `args` is accepted but not read by this function.  The NetSpec itself is
    returned (``to_proto()`` is not called here).
    """
    in_dims = [132, 132, 132]
    out_dims = [44, 44, 44]

    def train_only(stage=None):
        """Return an `include` rule list for phase 0, optionally tied to `stage`."""
        rule = dict(phase=0)
        if stage is not None:
            rule['stage'] = stage
        return [rule]

    net = caffe.NetSpec()

    # Data input layer (present in every phase).
    net.data, net.datai = L.MemoryData(dim=[1, 1] + in_dims, ntop=2)
    # Label input layer.
    net.label, net.labeli = L.MemoryData(dim=[1, 3] + out_dims, ntop=2,
                                         include=train_only())
    # Components label layer ('malis' stage only).
    net.components, net.componentsi = L.MemoryData(dim=[1, 1] + out_dims, ntop=2,
                                                   include=train_only('malis'))
    # Scale input layer ('euclid' stage only).
    net.scale, net.scalei = L.MemoryData(dim=[1, 3] + out_dims, ntop=2,
                                         include=train_only('euclid'))
    # `nhood` input layer ('malis' stage only).
    net.nhood, net.nhoodi = L.MemoryData(dim=[1, 1, 3, 3], ntop=2,
                                         include=train_only('malis'))

    # Silence the second ("...i") top of every MemoryData layer that exists
    # in the corresponding phase/stage.
    net.silence1 = L.Silence(net.datai, net.labeli, net.scalei, ntop=0,
                             include=train_only('euclid'))
    net.silence2 = L.Silence(net.datai, net.labeli, net.componentsi, net.nhoodi,
                             ntop=0, include=train_only('malis'))
    net.silence3 = L.Silence(net.datai, ntop=0, include=[dict(phase=1)])
    return net
def get_phocnet(self, word_image_lmdb_path, phoc_lmdb_path, phoc_size=604, generate_deploy=False):
    '''
    Assemble the PHOCNet in a NetSpec and return ``n.to_proto()``.

    The data layers and the convolutional body are added by
    ``set_phocnet_data`` and ``set_phocnet_conv_body``; this method adds the
    SPP layer, two fc/relu/dropout stages, the ``fc8`` output of size
    `phoc_size` and a sigmoid restricted to ``self.phase_test``.  Unless
    `generate_deploy` is true, a Silence layer on the sigmoid and the
    sigmoid cross-entropy loss against ``n.phocs`` are appended as well.
    '''
    n = NetSpec()

    # Data
    self.set_phocnet_data(n=n,
                          generate_deploy=generate_deploy,
                          word_image_lmdb_path=word_image_lmdb_path,
                          phoc_lmdb_path=phoc_lmdb_path)

    # Conv part
    self.set_phocnet_conv_body(n=n, relu_in_place=True)

    # FC part
    spp_settings = dict(pool=P.SPP.MAX, pyramid_height=3, engine=self.spp_engine)
    n.spp5 = L.SPP(n.relu4_3, spp_param=spp_settings)

    fc_stage = dict(layer_size=4096, dropout_ratio=0.5, relu_in_place=True)
    n.fc6, n.relu6, n.drop6 = self.fc_relu(bottom=n.spp5, **fc_stage)
    n.fc7, n.relu7, n.drop7 = self.fc_relu(bottom=n.drop6, **fc_stage)
    n.fc8 = L.InnerProduct(n.drop7,
                           num_output=phoc_size,
                           weight_filler=dict(type=self.initialization),
                           bias_filler=dict(type='constant'))
    n.sigmoid = L.Sigmoid(n.fc8, include=dict(phase=self.phase_test))

    # Output part
    if generate_deploy:
        return n.to_proto()
    n.silence = L.Silence(n.sigmoid, ntop=0, include=dict(phase=self.phase_test))
    n.loss = L.SigmoidCrossEntropyLoss(n.fc8, n.phocs)
    return n.to_proto()
def minivggnet(data, labels=None, train=False, cudnn=False, param=learned_param, num_classes=100, with_labels=True):
    """Write a VGG-variant net definition to a temp file via ``to_tempfile``.

    The stringified ``NetSpec.to_proto()`` is handed to ``to_tempfile`` and
    that call's result is returned.  With `with_labels`, loss and top-1/top-5
    accuracy layers are attached to `labels`; otherwise `labels` is only fed
    to a Silence layer.
    """
    conv_kwargs = dict(param=param, train=train, cudnn=cudnn)
    pool_kwargs = dict(train=train, cudnn=cudnn)

    n = caffe.NetSpec()
    n.data = data

    # Convolutional body.
    n.conv1, n.relu1 = conv_relu(n.data, 7, 96, stride=2, **conv_kwargs)
    n.norm1 = L.LRN(n.relu1, local_size=5, alpha=0.0005, beta=0.75, k=2)
    n.pool1 = max_pool(n.norm1, 3, stride=3, **pool_kwargs)
    n.conv2, n.relu2 = conv_relu(n.pool1, 5, 256, pad=1, stride=2, group=2, **conv_kwargs)
    n.pool2 = max_pool(n.relu2, 2, stride=2, **pool_kwargs)
    n.conv3, n.relu3 = conv_relu(n.pool2, 3, 512, pad=1, **conv_kwargs)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 512, pad=1, group=2, **conv_kwargs)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 512, pad=1, group=2, **conv_kwargs)
    n.pool5 = max_pool(n.relu5, 3, stride=3, **pool_kwargs)

    # Fully connected head.
    n.fc6, n.relu6 = fc_relu(n.pool5, 1024, param=param)
    n.drop6 = L.Dropout(n.relu6, in_place=True)
    n.fc7, n.relu7 = fc_relu(n.drop6, 1024, param=param)
    n.drop7 = L.Dropout(n.relu7, in_place=True)
    n.fc8 = L.InnerProduct(n.drop7, num_output=num_classes, param=param)

    if train:
        preds = n.fc8
    else:
        # Compute the per-label probabilities at test/inference time.
        n.probs = L.Softmax(n.fc8)
        preds = n.probs

    if not with_labels:
        n.ignored_label = labels
        n.silence_label = L.Silence(n.ignored_label, ntop=0)
    else:
        n.label = labels
        n.loss = L.SoftmaxWithLoss(n.fc8, n.label)
        n.accuracy_at_1 = L.Accuracy(preds, n.label)
        n.accuracy_at_5 = L.Accuracy(preds, n.label, accuracy_param=dict(top_k=5))

    return to_tempfile(str(n.to_proto()))
def conv1_autoencoder(split, batch_sz):
    """Return, as a string, the proto of a one-stage convolutional autoencoder.

    Images are read by an ImageData layer from `split` in batches of
    `batch_sz`, using the module-level `height` and `width`; the label top is
    silenced.  The reconstruction is compared with the flattened input by a
    sigmoid cross-entropy loss (weight 1) and a Euclidean loss (weight 0).
    """
    def frozen_specs():
        # Three param specs, each with lr_mult 0, for a BatchNorm layer.
        return [{"lr_mult": 0} for _ in range(3)]

    n = caffe.NetSpec()

    reader = dict(source=split, batch_size=batch_sz, new_height=height,
                  new_width=width, is_color=False)
    n.data, n.label = L.ImageData(image_data_param=reader, ntop=2)
    n.silence = L.Silence(n.label, ntop=0)
    n.flatdata_i = L.Flatten(n.data)

    # Encoder: conv -> BatchNorm -> Scale -> ReLU (negative_slope 0.1) -> pool -> code.
    n.conv1 = conv(n.data, 5, 5, 64, pad=2)
    n.bn1 = L.BatchNorm(n.conv1, use_global_stats=False, in_place=True,
                        param=frozen_specs())
    n.scale1 = L.Scale(n.bn1, bias_term=True, in_place=True)
    n.relu1 = L.ReLU(n.scale1, relu_param=dict(negative_slope=0.1))
    n.pool1 = max_pool(n.relu1, 2, stride=2)
    n.code = conv(n.pool1, 5, 5, 64, pad=2)

    # Decoder: Deconvolution with a bilinear filler and lr_mult/decay_mult 0,
    # followed by conv -> BatchNorm -> Scale -> ReLU.
    upsample_cfg = dict(group=64, num_output=64, kernel_size=4, stride=2, pad=1,
                        bias_term=False, weight_filler=dict(type="bilinear"))
    n.upsample1 = L.Deconvolution(n.code,
                                  param=dict(lr_mult=0, decay_mult=0),
                                  convolution_param=upsample_cfg)
    n.deconv1 = conv(n.upsample1, 5, 5, 1, pad=2)
    n.debn1 = L.BatchNorm(n.deconv1, use_global_stats=False, in_place=True,
                          param=frozen_specs())
    n.descale1 = L.Scale(n.debn1, bias_term=True, in_place=True)
    n.derelu1 = L.ReLU(n.descale1, relu_param=dict(negative_slope=0.1))

    # Losses between the flattened reconstruction and the flattened input.
    n.flatdata_o = L.Flatten(n.derelu1)
    n.loss_s = L.SigmoidCrossEntropyLoss(n.flatdata_o, n.flatdata_i, loss_weight=1)
    n.loss_e = L.EuclideanLoss(n.flatdata_o, n.flatdata_i, loss_weight=0)

    return str(n.to_proto())
def build_retrieval_model(self, param_str, save_tag):
    """Build the retrieval net on ``self.n`` and write it out with ``write_net``.

    Args:
        param_str: passed, stringified, as `param_str` to the Python data layer.
        save_tag: forwarded to ``self.write_net`` together with ``self.n``.

    The Python data layer produces ``self.top_size`` tops.  The positive
    features, and the inter/intra negatives when ``self.inter`` /
    ``self.intra`` are set, go through the image model; the query goes
    through the language model; one loss per enabled negative type is
    registered on ``self.n``.
    """
    data = L.Python(module="data_processing",
                    layer=self.data_layer,
                    param_str=str(param_str),
                    ntop=self.top_size)
    # Expose every data-layer top on the NetSpec under its configured name.
    for key, value in self.params['top_names_dict'].items():
        setattr(self.n, key, data[value])

    im_model, lang_model = self.get_models()

    # Bottoms which are always produced.
    bottom_positive = data[self.top_name_dict['features_p']]
    query = data[self.top_name_dict['BoG']]
    p_time_stamp = data[self.top_name_dict['features_time_stamp_p']]
    n_time_stamp = data[self.top_name_dict['features_time_stamp_n']]

    # Bottoms which depend on the enabled negative types.
    if self.inter:
        bottom_inter = data[self.top_name_dict['features_inter']]
    if self.intra:
        bottom_intra = data[self.top_name_dict['features_intra']]

    # The call order below is kept as in the original: positive, inter, intra.
    bottom_positive = im_model(bottom_positive, p_time_stamp)
    if self.inter:
        bottom_inter = im_model(bottom_inter, p_time_stamp)
    if self.intra:
        bottom_intra = im_model(bottom_intra, n_time_stamp)

    # n_time_stamp is only consumed by the intra branch; with inter-only
    # training it is fed to a Silence layer instead.  Logical `and` replaces
    # the original bitwise `&`, which is only equivalent for real booleans.
    if self.inter and not self.intra:
        self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
            n_time_stamp, ntop=0)
        self.silence_count += 1

    cont = data[self.top_name_dict['cont']]
    query = lang_model(query, cont)

    # NOTE(review): `args` here is a name from the enclosing module scope,
    # not `self.args` -- confirm that this is intended.
    if args.tall_loss:
        loss_fn, loss_prefix = self.tall_loss, 'tall_loss'
    else:
        loss_fn, loss_prefix = self.ranking_loss, 'ranking_loss'

    if self.inter:
        self.n.tops[loss_prefix + '_inter'] = loss_fn(
            bottom_positive, bottom_inter, query, lw=self.lw_inter)
    if self.intra:
        self.n.tops[loss_prefix + '_intra'] = loss_fn(
            bottom_positive, bottom_intra, query, lw=self.lw_intra)

    self.write_net(save_tag, self.n)
def make_net_train(lmdb, preselection, batch_size=8, weights = [0, 0, 0.005, 0.01, 0.02, 0.08, 0.32]):
    """Build the training net and return ``net.to_proto()``.

    Args:
        lmdb: `source` of the CustomData layer.
        preselection: `preselection_file` of the CustomData layer.
        batch_size: batch size of the CustomData layer.
        weights: per-level loss weights; ``weights[i]`` weights the loss on
            ``predict_flow<i>``.  Levels whose weight is not > 0 get no loss
            layer.  The default list is only read, never modified.
    """
    net = caffe.NetSpec()

    reader_cfg = dict(source=lmdb,
                      preselection_file=preselection,
                      backend=P.Data.LMDB,
                      batch_size=batch_size,
                      preselection_label=1,
                      rand_permute=True,
                      rand_permute_seed=77,
                      slice_point=[3, 6, 8],
                      encoding=[1, 1, 2, 3],
                      verbose=True)
    net.img0, net.img1, net.flow_gt, net.aux = L.CustomData(
        data_param=reader_cfg, ntop=4, include=dict(phase=0))

    # Single-input Eltwise layers with coeff 0.00392156862745
    # (presumably 1/255, i.e. pixel scaling -- confirm).
    net.img0_subtract = L.Eltwise(
        net.img0, eltwise_param=dict(operation=1, coeff=0.00392156862745))
    net.img1_subtract = L.Eltwise(
        net.img1, eltwise_param=dict(operation=1, coeff=0.00392156862745))

    # Augmentation of both images and of the ground-truth flow.
    net.img0_aug, net.img0_aug_params = augment_first_image(net.img0_subtract)
    aug_params = generate_aug_params(net.img0_aug_params, net.img0_subtract, net.img0_aug)
    net.img1_aug = augment_second_image(net.img1_subtract, aug_params)
    net.flow_gt_aug = L.FlowAugmentation(
        net.flow_gt, net.img0_aug_params, aug_params,
        augmentation_param=dict(crop_width=448, crop_height=320))
    net.scaled_flow_gt = L.Eltwise(
        net.flow_gt_aug, eltwise_param=dict(operation=1, coeff=0.05))

    net = make_pwc_net_encoder_plus(net, net.img0_aug, net.img1_aug)

    # Levels >= 1: downsample the ground truth to the prediction, then L1 loss.
    for level in range(1, len(weights)):
        if not (weights[level] > 0.):
            continue
        suffix = str(level)
        prediction = getattr(net, 'predict_flow' + suffix)
        level_gt = L.Downsample(net.scaled_flow_gt, prediction,
                                propagate_down=[False, False])
        setattr(net, 'scaled_flow_gt' + suffix, level_gt)
        level_loss = L.L1Loss(prediction, level_gt,
                              loss_weight=weights[level],
                              l1_loss_param=dict(l2_per_location=True))
        setattr(net, 'loss' + suffix, level_loss)

    # loss at level 0: don't scale GT
    if weights[0] > 0.:
        net.loss0 = L.L1Loss(net.predict_flow0, net.scaled_flow_gt,
                             loss_weight=weights[0],
                             l1_loss_param=dict(l2_per_location=True),
                             propagate_down=[True, False])

    # Feed each raw data-layer top to its own Silence layer.
    net.Silence0 = L.Silence(net.img0, ntop=0)
    net.Silence1 = L.Silence(net.img1, ntop=0)
    net.Silence2 = L.Silence(net.flow_gt, ntop=0)
    net.Silence3 = L.Silence(net.aux, ntop=0)

    return net.to_proto()
def add_cnn(n, data, act, batch_size, T, K, num_step, mode='train'):
    """Add the encoder/decoder prediction layers to NetSpec `n` and return `n`.

    Args:
        n: NetSpec the layers are added to (modified in place).
        data: bottom flattened (axis 1..2) into ``n.x_flat``.
        act: bottom flattened (axis 1..2) into ``n.act_flat``.
        batch_size: not read by this function.
        T: number of slices ``n.x_flat`` is cut into along axis 1 in train
            mode (``n.act_flat`` is cut into ``T - 1``).
        K: number of leading slices used as the initial encoder input.
        num_step: number of unrolled prediction steps in train mode.
        mode: ``'train'`` builds the unrolled net with a Euclidean loss; any
            other value builds a single encoder/decoder pass.
    """
    n.x_flat = L.Flatten(data, axis=1, end_axis=2)
    n.act_flat = L.Flatten(act, axis=1, end_axis=2)
    if mode == 'train':
        x = L.Slice(n.x_flat, axis=1, ntop=T)
        act_slice = L.Slice(n.act_flat, axis=1, ntop=T - 1)
        x_set = ()
        label_set = ()
        x_hat_set = ()  # never read afterwards
        silence_set = ()
        for i in range(T):
            t = tag(i + 1)
            n.tops['x' + t] = x[i]
            # The first K slices seed the encoder input.
            if i < K:
                x_set += (x[i], )
            # Only T - 1 action slices exist.
            if i < T - 1:
                n.tops['act' + t] = act_slice[i]
            # The decoder below reads 'act' + tag(step + K), i.e. never one
            # of the first K - 1 actions; those are collected for Silence.
            if i < K - 1:
                silence_set += (n.tops['act' + t], )
            # Slices from index K on are the prediction targets.
            if i >= K:
                label_set += (x[i], )
        n.label = L.Concat(*label_set, axis=0)
        input_list = list(x_set)
        for step in range(0, num_step):
            # The first step uses untagged layer names.
            step_tag = tag(step + 1) if step > 0 else ''
            t = tag(step + K)
            tp = tag(step + K + 1)
            input_tuple = tuple(input_list)
            n.tops['input' + step_tag] = L.Concat(*input_tuple, axis=1)
            top = add_conv_enc(n, n.tops['input' + step_tag], tag=step_tag)
            n.tops['x_hat' + tp] = add_decoder(n,
                                               top,
                                               n.tops['act' + t],
                                               flatten=False,
                                               tag=step_tag)
            # Slide the input window: drop the oldest entry, append the
            # prediction just made.
            input_list.pop(0)
            input_list.append(n.tops['x_hat' + tp])
    else:
        top = add_conv_enc(n, n.x_flat)
        n.tops['x_hat' + tag(K + 1)] = add_decoder(n,
                                                   top,
                                                   n.act_flat,
                                                   flatten=False)
    if mode == 'train':
        x_hat = ()
        # NOTE(review): reads 'x_hat' tops for indices K..T-1, while the loop
        # above creates them for num_step steps -- this presumably requires
        # num_step >= T - K; confirm with callers.
        for i in range(K, T):
            t = tag(i + 1)
            x_hat += (n.tops['x_hat' + t], )
        n.x_hat = L.Concat(*x_hat, axis=0)
        n.silence = L.Silence(*silence_set, ntop=0)
        n.l2_loss = L.EuclideanLoss(n.x_hat, n.label)
    return n
def exp_proto(mode, batchsize, T, exp_T, question_vocab_size, exp_vocab_size):
    """Build the explanation LSTM net and return ``n.to_proto()``.

    `mode` and `batchsize` are passed as JSON to the Python data layer,
    `exp_T` is the tile count along axis 0 of the attention feature and
    `exp_vocab_size` sizes both the embedding input and the prediction
    output.  `T` and `question_vocab_size` are not read by this function.
    """
    def uniform_filler():
        return dict(type='uniform', min=-0.08, max=0.08)

    def lstm_settings(hidden_size):
        return dict(num_output=hidden_size,
                    weight_filler=uniform_filler(),
                    bias_filler=dict(type='constant', value=0))

    n = caffe.NetSpec()
    provider_cfg = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.exp_att_feature, n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2 = L.Python(
        module='exp_data_provider_layer',
        layer='ExpDataProviderLayer',
        param_str=provider_cfg,
        ntop=5)

    # Word embedding followed by TanH.
    n.exp_embed_ba = L.Embed(n.exp,
                             input_dim=exp_vocab_size,
                             num_output=300,
                             weight_filler=uniform_filler())
    n.exp_embed = L.TanH(n.exp_embed_ba)

    # LSTM1 for Explanation
    n.exp_lstm1 = L.LSTM(n.exp_embed, n.exp_cont_1,
                         recurrent_param=lstm_settings(2048))
    n.exp_lstm1_dropped = L.Dropout(n.exp_lstm1,
                                    dropout_param={'dropout_ratio': 0.3})

    # Merge with LSTM1 for explanation: reshape and tile the attention
    # feature, multiply element-wise, then signed sqrt + L2 normalisation.
    n.exp_att_resh = L.Reshape(
        n.exp_att_feature,
        reshape_param=dict(shape=dict(dim=[1, -1, 2048])))
    n.exp_att_tiled = L.Tile(n.exp_att_resh, axis=0, tiles=exp_T)
    n.exp_eltwise_all = L.Eltwise(n.exp_lstm1_dropped, n.exp_att_tiled,
                                  eltwise_param={'operation': P.Eltwise.PROD})
    n.exp_eltwise_all_sqrt = L.SignedSqrt(n.exp_eltwise_all)
    n.exp_eltwise_all_l2 = L.L2Normalize(n.exp_eltwise_all_sqrt)
    n.exp_eltwise_all_drop = L.Dropout(n.exp_eltwise_all_l2,
                                       dropout_param={'dropout_ratio': 0.3})

    # LSTM2 for Explanation
    n.exp_lstm2 = L.LSTM(n.exp_eltwise_all_drop, n.exp_cont_2,
                         recurrent_param=lstm_settings(1024))
    n.exp_lstm2_dropped = L.Dropout(n.exp_lstm2,
                                    dropout_param={'dropout_ratio': 0.3})

    n.exp_prediction = L.InnerProduct(n.exp_lstm2_dropped,
                                      num_output=exp_vocab_size,
                                      weight_filler=dict(type='xavier'),
                                      axis=2)
    n.silence_exp_prediction = L.Silence(n.exp_prediction, ntop=0)
    return n.to_proto()
def datalayer_test(imdb, batch_size=4):
    """Build a net that only exercises the EigenDataLayer and return its proto.

    `imdb` and `batch_size` are accepted but not read.  A Python data layer
    is declared for phase 0 ('train') and for phase 1 ('test'); the `image`
    and `depth` tops are then each fed to a Silence layer.

    The unused filler dicts, the unused function-scope imports and the
    commented-out convolutional body (conv1_1 ... EuclideanLoss) that used
    to live here were removed; none of them affected the returned net.
    """
    from caffe import layers as L

    n = caffe.NetSpec()

    # NOTE(review): both iterations assign the same NetSpec attributes
    # (`image`, `depth`), so the phase-1 tops replace the phase-0 ones.  If
    # NetSpec keys tops by attribute name, the 'data_train' layer will not
    # appear in the returned proto -- confirm whether that is intended.
    for phase, split in ((0, 'train'), (1, 'test')):
        n.image, n.depth = L.Python(
            name='data_' + split,
            ntop=2,
            include={'phase': phase},
            python_param={
                'module': 'data_layer',
                'layer': 'EigenDataLayer',
                'param_str': "{'data_type': '%s', 'year': '2012'}" % split,
            })

    n.image_s = L.Silence(n.image, name='silence_image', ntop=0)
    n.depth_s = L.Silence(n.depth, name='silence_depth', ntop=0)

    return n.to_proto()
def define_network(args, imageFile, vidIds, radarFiles, training=False):
    """Build the image / radar / joint classification net and return its proto.

    ``args.expType`` selects the variant:
      * ``"image"`` -- convolutional tower on the image data only;
      * ``"joint"`` -- same tower, with ``fc7_new`` concatenated with radar;
      * ``"radar"`` -- image data silenced, fully connected layers on radar.
    `training` only controls mirroring in the ImageData transform.
    """
    net = caffe.NetSpec()
    exp_type = args.expType

    # Setting up data layer
    transformParam = dict(mirror=training, mean_value=args.mean)
    pydataParams = dict(radar_files=radarFiles, videos=vidIds,
                        batch_size=args.batchSize)
    net.data, net.label = L.ImageData(transform_param=transformParam,
                                      source=imageFile,
                                      shuffle=False,
                                      batch_size=args.batchSize,
                                      ntop=2)
    if exp_type != 'image':
        net.radar = L.Python(module='radarDataLayer',
                             layer='RadarDataLayer',
                             param_str=str(pydataParams),
                             ntop=1)

    if exp_type in ("joint", "image"):
        # Convolutional tower.
        net.conv1, net.relu1 = conv_relu(net.data, 11, 96, stride=4)
        net.pool1 = max_pool(net.relu1, 3, stride=2)
        net.norm1 = L.LRN(net.pool1, local_size=5, alpha=1e-4, beta=0.75)
        net.conv2, net.relu2 = conv_relu(net.norm1, 5, 256, pad=2, group=2)
        net.pool2 = max_pool(net.relu2, 3, stride=2)
        net.norm2 = L.LRN(net.pool2, local_size=5, alpha=1e-4, beta=0.75)
        net.conv3, net.relu3 = conv_relu(net.norm2, 3, 384, pad=1)
        net.conv4, net.relu4 = conv_relu(net.relu3, 3, 384, pad=1, group=2)
        net.conv5, net.relu5 = conv_relu(net.relu4, 3, 256, pad=1, group=2)
        net.pool5 = max_pool(net.relu5, 3, stride=2)
        net.fc6_new, net.relu6_new = fc_relu(net.pool5, 4096)
        net.drop6 = L.Dropout(net.relu6_new, in_place=True)
        net.fc7_new = L.InnerProduct(net.drop6, num_output=4096,
                                     param=learned_param,
                                     weight_filler=fc_filler)
        if exp_type == "joint":
            # Fuse image and radar features before the non-linearity.
            net.concat = L.Concat(net.fc7_new, net.radar)
            relu_input = net.concat
        else:
            relu_input = net.fc7_new
        net.relu7 = L.ReLU(relu_input, in_place=True)
        net.drop7 = L.Dropout(net.relu7, in_place=True)
        net.final = L.InnerProduct(net.drop7, num_output=args.num_out,
                                   param=learned_param,
                                   weight_filler=fc_filler)
    elif exp_type == "radar":
        # The image blob is not used in this variant.
        net.silence = L.Silence(net.data, ntop=0)
        net.fc7_new = L.InnerProduct(net.radar, num_output=1024,
                                     param=learned_param,
                                     weight_filler=fc_filler)
        net.relu7 = L.ReLU(net.fc7_new, in_place=True)
        net.drop7 = L.Dropout(net.relu7, in_place=True)
        net.final = L.InnerProduct(net.drop7, num_output=args.num_out,
                                   param=learned_param,
                                   weight_filler=fc_filler)

    net.loss = L.SoftmaxWithLoss(net.final, net.label)
    net.acc = L.Accuracy(net.final, net.label)
    return net.to_proto()
def generate_scores(split, config):
    """Build the language/image/spatial scoring net and return its proto.

    The Python data layer (module ``config.data_provider``) yields five
    tops.  The language input is embedded and run through an LSTM, of which
    only the last of ``config.T`` slices is kept; it is L2-normalised,
    concatenated with the normalised image feature and the spatial feature,
    and scored by a small MLP trained with a sigmoid cross-entropy loss.
    """
    n = caffe.NetSpec()
    provider_cfg = str(dict(split=split, batch_size=config.N))
    n.language, n.cont, n.img_feature, n.spatial, n.label = L.Python(
        module=config.data_provider,
        layer='TossLayer',
        param_str=provider_cfg,
        ntop=5)

    # embedding
    n.embed = L.Embed(n.language,
                      input_dim=config.vocab_size,
                      num_output=config.embed_dim,
                      weight_filler=dict(type='uniform', min=-0.08, max=0.08))

    # LSTM
    lstm_cfg = dict(num_output=config.lstm_dim,
                    weight_filler=dict(type='uniform', min=-0.08, max=0.08),
                    bias_filler=dict(type='constant', value=0))
    n.lstm = L.LSTM(n.embed, n.cont, recurrent_param=lstm_cfg)

    # Slice along axis 0; every slice but the last is named and silenced.
    tops = L.Slice(n.lstm, ntop=config.T, slice_param=dict(axis=0))
    for step in range(config.T - 1):
        setattr(n, 'slice%d' % step, tops[step])
        setattr(n, 'silence%d' % step, L.Silence(tops[step], ntop=0))
    n.lstm_out = tops[-1]
    n.lstm_feat = L.Reshape(
        n.lstm_out,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # L2 Normalize image and language features
    n.img_l2norm = L.L2Normalize(n.img_feature)
    n.lstm_l2norm = L.L2Normalize(n.lstm_feat)
    n.img_l2norm_resh = L.Reshape(
        n.img_l2norm,
        reshape_param=dict(shape=dict(dim=[-1, config.D_im])))
    n.lstm_l2norm_resh = L.Reshape(
        n.lstm_l2norm,
        reshape_param=dict(shape=dict(dim=[-1, config.D_text])))

    # Concatenate
    n.feat_all = L.Concat(n.lstm_l2norm_resh, n.img_l2norm_resh, n.spatial,
                          concat_param=dict(axis=1))

    # MLP Classifier over concatenated feature
    n.mlp_l1, n.mlp_relu1 = fc_relu(n.feat_all, config.mlp_hidden_dims)
    if config.mlp_dropout:
        n.mlp_drop1 = L.Dropout(n.mlp_relu1, dropout_ratio=0.5, in_place=True)
        classifier_input = n.mlp_drop1
    else:
        classifier_input = n.mlp_relu1
    n.scores = fc(classifier_input, 1)

    # Loss Layer
    n.loss = L.SigmoidCrossEntropyLoss(n.scores, n.label)
    return n.to_proto()
def build_test_train(n, top, train, with_labels, labels):
    """Append the final layers to NetSpec `n`, on top of `top`.

    When `train` is false a Softmax (``n.probs``) is added and used as the
    prediction; otherwise `top` itself is the prediction.  With
    `with_labels`, a softmax loss plus top-1 and top-5 accuracy layers are
    attached to `labels`; without it, `labels` is only fed to a Silence
    layer.  Nothing is returned; `n` is modified in place.
    """
    if train:
        preds = top
    else:
        # Compute the per-label probabilities at test/inference time.
        n.probs = layers.Softmax(top)
        preds = n.probs

    if not with_labels:
        n.ignored_label = labels
        n.silence_label = layers.Silence(n.ignored_label, ntop=0)
        return

    n.label = labels
    n.loss = layers.SoftmaxWithLoss(top, labels)
    n.accuracy_at_1 = layers.Accuracy(preds, labels)
    n.accuracy_at_5 = layers.Accuracy(preds, labels,
                                      accuracy_param=dict(top_k=5))
def build_retrieval_model(self, param_str, save_tag):
    """Build the retrieval net on ``self.n`` and write it out with ``write_net``.

    Args:
        param_str: passed, stringified, as `param_str` to the Python data layer.
        save_tag: forwarded to ``self.write_net`` together with ``self.n``.

    The positive features, and the inter/intra negatives when
    ``self.inter`` / ``self.intra`` are set, go through the image model;
    the query goes through the language model; one ranking loss per enabled
    negative type is registered on ``self.n``.
    """
    # TODO: This would perhaps be cleaner if I did not co-sample inter/intra
    # positives negatives; shouldn't have to do that and could get rid of
    # determining top size...

    # Gets all the tops from the data layer, and names them sensible things.
    data = L.Python(module="data_processing",
                    layer=self.data_layer,
                    param_str=str(param_str),
                    ntop=self.top_size)
    for key, value in self.params['top_names_dict'].items():
        setattr(self.n, key, data[value])

    im_model, lang_model = self.get_models()

    # Bottoms which are always produced.
    bottom_positive = data[self.top_name_dict['features_p']]
    query = data[self.top_name_dict['query']]
    p_time_stamp = data[self.top_name_dict['features_time_stamp_p']]
    n_time_stamp = data[self.top_name_dict['features_time_stamp_n']]

    # Bottoms which depend on the enabled negative types.
    if self.inter:
        bottom_inter = data[self.top_name_dict['features_inter']]
    if self.intra:
        bottom_intra = data[self.top_name_dict['features_intra']]

    # The call order below is kept as in the original: positive, inter, intra.
    bottom_positive = im_model(bottom_positive, p_time_stamp)
    if self.inter:
        bottom_inter = im_model(bottom_inter, p_time_stamp)
    if self.intra:
        bottom_intra = im_model(bottom_intra, n_time_stamp)

    # n_time_stamp is only consumed by the intra branch; with inter-only
    # training it is fed to a Silence layer instead.  Logical `and` replaces
    # the original bitwise `&`, which is only equivalent for real booleans.
    if self.inter and not self.intra:
        self.n.tops['silence_cell_' + str(self.silence_count)] = L.Silence(
            n_time_stamp, ntop=0)
        self.silence_count += 1

    cont = data[self.top_name_dict['cont']]
    query = lang_model(query, cont)

    if self.inter:
        self.n.tops['ranking_loss_inter'] = self.ranking_loss(
            bottom_positive, bottom_inter, query, lw=self.lw_inter)
    if self.intra:
        self.n.tops['ranking_loss_intra'] = self.ranking_loss(
            bottom_positive, bottom_intra, query, lw=self.lw_intra)

    self.write_net(save_tag, self.n)
def language_model_lstm_no_embed(self, sent_bottom, cont_bottom, text_name='embedding_text', tag=''):
    """Add an LSTM language model (no word-embedding layer) and return its top.

    Args:
        sent_bottom: first bottom of the LSTM layer.
        cont_bottom: second bottom of the LSTM layer.
        text_name: attribute name under which the final top is stored on
            ``self.n``.
        tag: suffix appended to the shared parameter names.

    The LSTM output is split along axis 0 at ``sentence_length - 1``; the
    first part is silenced, the second is reshaped and projected by an
    InnerProduct layer whose top is returned.
    """
    lstm_lr = self.args.lstm_lr
    embedding_lr = self.args.language_embedding_lr

    recurrent_cfg = dict(
        num_output=self.language_embedding_dim[0],
        weight_filler=self.uniform_weight_filler(-0.08, 0.08),
        bias_filler=self.constant_filler(0))
    lstm_param_names = ['lstm1' + tag, 'lstm2' + tag, 'lstm3' + tag]
    lstm_learning = self.learning_params(
        [[lstm_lr, lstm_lr], [lstm_lr, lstm_lr], [lstm_lr, lstm_lr]],
        lstm_param_names)
    lstm = L.LSTM(sent_bottom, cont_bottom,
                  recurrent_param=recurrent_cfg,
                  param=lstm_learning)

    # Two tops: everything before the slice point, and the remainder.
    earlier_steps, final_step = L.Slice(
        lstm,
        slice_point=self.params['sentence_length'] - 1,
        axis=0,
        ntop=2)
    silence_key = 'silence_cell_' + str(self.silence_count)
    self.n.tops[silence_key] = L.Silence(earlier_steps, ntop=0)
    self.silence_count += 1

    top_lstm = L.Reshape(
        final_step,
        shape=dict(dim=[-1, self.language_embedding_dim[0]]))
    top_text = L.InnerProduct(
        top_lstm,
        num_output=self.language_embedding_dim[1],
        weight_filler=self.uniform_weight_filler(-0.08, .08),
        bias_filler=self.constant_filler(0),
        param=self.learning_params(
            [[embedding_lr, embedding_lr], [embedding_lr * 2, 0]],
            ['lstm_embed1' + tag, 'lstm_embed_1b' + tag]))

    setattr(self.n, text_name, top_text)
    return top_text
def minialexnet(data, labels=None, train=False, param=learned_param, num_classes=100, with_labels=True):
    """Write an AlexNet-variant net definition to a temp file via ``to_tempfile``.

    The layout follows the original specification
    (<caffe>/models/bvlc_alexnet/train_val.prototxt) with two changes:
      - no LRN (local response normalization) layers;
      - smaller fully connected layers (fc6 and fc7), because mini-places
        images (128x128) have a lower resolution than ImageNet images
        (usually resized to 256x256).

    With `with_labels`, loss and top-1/top-5 accuracy layers are attached to
    `labels`; otherwise `labels` is only fed to a Silence layer.
    """
    conv_kwargs = dict(param=param, train=train)

    n = caffe.NetSpec()
    n.data = data

    # Convolutional body.
    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4, **conv_kwargs)
    n.pool1 = max_pool(n.relu1, 3, stride=2, train=train)
    n.conv2, n.relu2 = conv_relu(n.pool1, 5, 256, pad=2, group=2, **conv_kwargs)
    n.pool2 = max_pool(n.relu2, 3, stride=2, train=train)
    n.conv3, n.relu3 = conv_relu(n.pool2, 3, 384, pad=1, **conv_kwargs)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2, **conv_kwargs)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2, **conv_kwargs)
    n.pool5 = max_pool(n.relu5, 3, stride=2, train=train)

    # Fully connected head.
    n.fc6, n.relu6 = fc_relu(n.pool5, 1024, param=param)
    n.drop6 = L.Dropout(n.relu6, in_place=True)
    n.fc7, n.relu7 = fc_relu(n.drop6, 1024, param=param)
    n.drop7 = L.Dropout(n.relu7, in_place=True)
    n.fc8 = L.InnerProduct(n.drop7, num_output=num_classes, param=param)

    if train:
        preds = n.fc8
    else:
        # Compute the per-label probabilities at test/inference time.
        n.probs = L.Softmax(n.fc8)
        preds = n.probs

    if not with_labels:
        n.ignored_label = labels
        n.silence_label = L.Silence(n.ignored_label, ntop=0)
    else:
        n.label = labels
        n.loss = L.SoftmaxWithLoss(n.fc8, n.label)
        n.accuracy_at_1 = L.Accuracy(preds, n.label)
        n.accuracy_at_5 = L.Accuracy(preds, n.label, accuracy_param=dict(top_k=5))

    return to_tempfile(str(n.to_proto()))
def qlstm(mode, batchsize, T, question_vocab_size):
    """Build the two-LSTM question net with two-glimpse attention; return its proto.

    Used to generate the network programmatically instead of from a
    hand-written prototxt.

    Args:
        mode: passed to the data layer inside the JSON `param_str`.
        batchsize: passed to the data layer and used as the first dim of the
            DummyData blob.
        T: number of slices each LSTM output is cut into along axis 0; only
            the last slice is kept.
        question_vocab_size: `input_dim` of the word Embed layer.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})

    # Python data layer (see
    # https://stackoverflow.com/questions/41344168/what-is-a-python-layer-in-caffe).
    # Original author's notes: the layer is created from the named class, and
    # the actual data-batch loading happens inside that class.
    #   module    = name of the python file
    #   layer     = python class implementing the layer interface
    #   param_str = JSON parameters used when loading data (stored on the
    #               class as self.param_str)
    #   ntop      = number of tops
    # GloVe = Global Vectors for Word Representation
    # (https://www.aclweb.org/anthology/D14-1162); pretrained GloVe vectors
    # are used in the concat below.
    # Per the original note, img_feature is an already preprocessed feature
    # vector (passed through a ResNet and L2-normalised).
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    # Word embedding with a 300-dim output, followed by TanH.
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                         weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)

    # Concatenate with the GloVe data.
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` instead of `xrange`: the latter does not exist on Python 3.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i), L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    # See https://www.programcreek.com/python/example/107865/caffe.NetSpec:
    # give each tops2[i] the name 'slice_second<i>'.
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i), L.Silence(tops2[i], ntop=0))
    # Use the last LSTM output.
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out,
        reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,
                                        dropout_param={'dropout_ratio': 0.3})

    # lstm1 output => reshape to 1024, then dropout
    # lstm2 output => reshape to 1024, then dropout
    # then concat
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # The question blob is tiled explicitly along axes 2 and 3 (14 each) to
    # match the 14x14 image feature (original note: L.Tile does not align
    # dimensions automatically).
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh,
                                              axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1,
                                            axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    # Unlike the figure in the paper, a Dropout is added here.
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,
                              dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1,
                                num_output=512, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    # Unlike the figure in the paper, the output dim is 2.
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1,
                                num_output=2, pad=0,
                                weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2,
        reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    # The softmax yields the attention maps: two 14x14 maps.
    n.att = L.Reshape(
        n.att_softmax,
        reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    # Slice the two attention maps apart.
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0,
        reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1,
        reshape_param=dict(shape=dict(dim=[-1, 2048])))
    # The two attention-weighted features are computed separately, then
    # concatenated.
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    # (the concatenated attention feature is reshaped to 4096 first)
    n.att_feature_resh = L.Reshape(
        n.att_feature,
        reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    # Unlike the paper, the two input vectors differ in size:
    #   paper: 2048 and 2048
    #   code:  4096 and 2048
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2,
                             dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped,
        reshape_param=dict(shape=dict(dim=[-1, 16000])))

    # Fully connected prediction layer and loss.
    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000,
                                  weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def qlstm(mode, batchsize, T, question_vocab_size):
    """Define the question-LSTM / two-map attention VQA network.

    Five tops are read from the ``VQADataProviderLayer`` Python layer. The
    learned word embedding is concatenated with the ``glove`` top and fed
    through two stacked LSTMs; each LSTM output is sliced into ``T`` pieces
    along axis 0, the first ``T - 1`` are silenced and the last one is kept.
    The resulting question vector is fused with the image feature through
    compact bilinear pooling, a two-map soft attention, a second compact
    bilinear pooling and a final classifier with softmax loss.

    Args:
        mode: Forwarded to the data layer inside its JSON ``param_str``.
        batchsize: Forwarded to the data layer and used as the first
            dimension of the constant ``DummyData`` blob.
        T: Number of slices taken along axis 0 of each LSTM output.
        question_vocab_size: ``input_dim`` of the word ``Embed`` layer.

    Returns:
        The result of ``caffe.NetSpec.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=5)

    n.embed_ba = L.Embed(
        n.data, input_dim=question_vocab_size, num_output=300,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    n.embed = L.TanH(n.embed_ba)
    concat_word_embed = [n.embed, n.glove]
    n.concat_embed = L.Concat(*concat_word_embed,
                              concat_param={'axis': 2})  # T x N x 600

    # LSTM1
    n.lstm1 = L.LSTM(
        n.concat_embed, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis': 0})
    # `range` (not `xrange`) so the builder also runs on Python 3.
    for i in range(T - 1):
        setattr(n, 'slice_first' + str(i), tops1[i])
        setattr(n, 'silence_data_first' + str(i),
                L.Silence(tops1[i], ntop=0))
    n.lstm1_out = tops1[T - 1]
    n.lstm1_reshaped = L.Reshape(
        n.lstm1_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm1_reshaped_droped = L.Dropout(
        n.lstm1_reshaped, dropout_param={'dropout_ratio': 0.3})
    n.lstm1_droped = L.Dropout(
        n.lstm1, dropout_param={'dropout_ratio': 0.3})

    # LSTM2 (consumes the dropped-out full output of LSTM1)
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=1024,
            weight_filler=dict(type='uniform', min=-0.08, max=0.08),
            bias_filler=dict(type='constant', value=0)))
    tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis': 0})
    for i in range(T - 1):
        setattr(n, 'slice_second' + str(i), tops2[i])
        setattr(n, 'silence_data_second' + str(i),
                L.Silence(tops2[i], ntop=0))
    n.lstm2_out = tops2[T - 1]
    n.lstm2_reshaped = L.Reshape(
        n.lstm2_out, reshape_param=dict(shape=dict(dim=[-1, 1024])))
    n.lstm2_reshaped_droped = L.Dropout(
        n.lstm2_reshaped, dropout_param={'dropout_ratio': 0.3})
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # Tile the question vector to 14 along axes 2 and 3 so it can be pooled
    # with the reshaped image feature.
    n.q_emb_tanh_droped_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(
        n.q_emb_tanh_droped_resh, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(
        n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(
        n.img_feature,
        reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(
        n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(
        n.blcf_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(
        n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0,
        weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(
        n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0,
        weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(
        n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14 * 14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(
        n.att_softmax, reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]),
                        data_filler=dict(type='constant', value=1),
                        ntop=1)
    n.att_feature0 = L.SoftAttention(
        n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(
        n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(
        n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(
        n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and lstm with compact bilinear pooling
    n.att_feature_resh = L.Reshape(
        n.att_feature,
        reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    n.lstm_12_resh = L.Reshape(
        n.lstm_12, reshape_param=dict(shape=dict(dim=[-1, 2048, 1, 1])))
    n.bc_att_lstm = L.CompactBilinear(
        n.att_feature_resh, n.lstm_12_resh,
        compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
    n.bc_dropped = L.Dropout(
        n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(
        n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(
        n.bc_dropped_resh, num_output=3000,
        weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
def caffenet(netmode):
    """Assemble the USK network definition for the requested phase.

    Uses the module-level ``netconf``, ``data_layer``, ``implement_usknet``
    and ``compute_memory_*`` helpers. For ``caffe_pb2.TEST`` a prediction
    layer (``net.prob``) is attached; for every other mode the label inputs
    and the loss selected by ``netconf.loss_function`` are attached. In both
    cases the run shapes and memory estimates are printed.

    Args:
        netmode: Phase selector, compared against ``caffe_pb2.TEST``.

    Returns:
        The result of ``net.to_proto()`` for the generated network.
    """
    net = caffe.NetSpec()

    def report_memory(shapes, blob_copies):
        # Print every run-shape entry followed by the memory estimates;
        # `blob_copies` is the multiplier applied to the blob memory.
        for entry in shapes:
            print(entry)
        print("Max. memory requirements: %s B" %
              (compute_memory_buffers(shapes) +
               compute_memory_weights(shapes) +
               blob_copies * compute_memory_blobs(shapes)))
        print("Weight memory: %s B" % compute_memory_weights(shapes))
        print("Max. conv buffer: %s B" % compute_memory_buffers(shapes))

    if netmode == caffe_pb2.TEST:
        # Number of output feature maps depends on the loss the net was
        # trained with.
        if netconf.loss_function == 'malis':
            fmaps_end = 11
        elif netconf.loss_function == 'euclid':
            fmaps_end = 11
        elif netconf.loss_function == 'softmax':
            fmaps_end = 2

        net.data, net.datai = data_layer([1, 1, 44, 132, 132])
        net.silence = L.Silence(net.datai, ntop=0)

        # Shape specs:
        # 00. Convolution buffer size
        # 01. Weight memory size
        # 03. Num. channels
        # 04. [d] parameter running value
        # 05. [w] parameter running value
        run_shape_out = [[0, 0, 1, [1, 1, 1], [44, 132, 132]]]

        last_blob = implement_usknet(net, run_shape_out, 64, fmaps_end)

        # Prediction layer
        if netconf.loss_function in ('malis', 'euclid'):
            net.prob = L.Sigmoid(last_blob, ntop=1)
        elif netconf.loss_function == 'softmax':
            net.prob = L.Softmax(last_blob, ntop=1)

        report_memory(run_shape_out, 1)
    else:
        # Training inputs: every data layer yields a second top that is not
        # consumed by the network and is therefore silenced.
        if netconf.loss_function == 'malis':
            net.data, net.datai = data_layer([1, 1, 44, 132, 132])
            net.label, net.labeli = data_layer([1, 1, 16, 44, 44])
            net.label_affinity, net.label_affinityi = data_layer(
                [1, 11, 16, 44, 44])
            net.affinity_edges, net.affinity_edgesi = data_layer(
                [1, 1, 11, 3])
            net.silence = L.Silence(net.datai, net.labeli,
                                    net.label_affinityi,
                                    net.affinity_edgesi, ntop=0)
            fmaps_end = 11
        elif netconf.loss_function == 'euclid':
            net.data, net.datai = data_layer([1, 1, 44, 132, 132])
            net.label, net.labeli = data_layer([1, 11, 16, 44, 44])
            net.scale, net.scalei = data_layer([1, 11, 16, 44, 44])
            net.silence = L.Silence(net.datai, net.labeli, net.scalei,
                                    ntop=0)
            fmaps_end = 11
        elif netconf.loss_function == 'softmax':
            net.data, net.datai = data_layer([1, 1, 44, 132, 132])
            # Currently only supports binary classification
            net.label, net.labeli = data_layer([1, 1, 16, 44, 44])
            net.silence = L.Silence(net.datai, net.labeli, ntop=0)
            fmaps_end = 2

        run_shape_out = [[0, 1, 1, [1, 1, 1], [44, 132, 132]]]

        # Start the actual network
        last_blob = implement_usknet(net, run_shape_out, 64, fmaps_end)

        # Blob memory is counted twice in this branch.
        report_memory(run_shape_out, 2)

        # Loss
        if netconf.loss_function == 'malis':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.MalisLoss(last_blob, net.label_affinity, net.label,
                                   net.affinity_edges, ntop=0)
        elif netconf.loss_function == 'euclid':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.EuclideanLoss(last_blob, net.label, net.scale,
                                       ntop=0)
        elif netconf.loss_function == 'softmax':
            net.loss = L.SoftmaxWithLoss(last_blob, net.label, ntop=0)

    # Protocol buffer of the generated network
    return net.to_proto()
def resnet_mask_rcnn_mask_rcnn(self, stage=1):
    """Build the detection + mask head network on a ResNet backbone.

    Training (``self.deploy`` false): ROIs, labels, box targets and masks
    come from ``self.data_layer_train_with_ins``; the RPN is instantiated
    with ``fixed=True`` and its two outputs are silenced; detection and mask
    ROIs are concatenated, aligned, passed through the last residual stage
    and split again at ``self.rois_num``; classification, box and mask
    losses are attached.

    Deploy: ROIs come from ``self.roi_proposals`` on the RPN outputs and
    ``cls_prob`` / ``mask_prob`` layers are attached instead of losses.

    Args:
        stage: ``1`` builds the backbone with ``fixed=False``; any other
            value builds it with ``fixed=True``.

    Returns:
        The result of ``self.net.to_proto()``.
    """
    channals = self.channals

    # Bound on every path: the training branch below also hands gt_boxes to
    # self.rpn, so it must not depend on the deploy branch having run.
    gt_boxes = None
    if not self.deploy:
        (data, rois, labels, bbox_targets, bbox_inside_weights,
         bbox_outside_weights, mask_rois, masks) = \
            self.data_layer_train_with_ins(with_rpn=False)
        im_info = None
    else:
        data, im_info = self.data_layer_test()

    pre_traned_fixed = stage != 1

    conv1 = self.conv_factory("conv1", data, 7, channals, 2, 3,
                              bias_term=True, fixed=pre_traned_fixed)
    pool1 = self.pooling_layer(3, 2, 'MAX', 'pool1', conv1)
    index = 1
    out = pool1

    if self.module == "normal":
        residual_block = self.residual_block
    else:
        residual_block = self.residual_block_basic

    # Backbone stages (all but the last). The first block of a stage gets an
    # explicit stride: 1 for stage "res2", 2 for later stages.
    for block_count in self.stages[:-1]:
        index += 1
        for j in range(block_count):
            block_name = "res" + str(index) + ascii_lowercase[j]
            if j == 0:
                stride = 1 if index == 2 else 2
                out = residual_block(block_name, out, channals, stride,
                                     fixed=pre_traned_fixed)
            else:
                out = residual_block(block_name, out, channals,
                                     fixed=pre_traned_fixed)
        channals *= 2

    if not self.deploy:
        # RPN outputs are not consumed during training, so silence them.
        rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(
            out, gt_boxes, im_info, data, fixed=True)
        self.net["silence_rpn_cls_score_reshape"] = L.Silence(
            rpn_cls_score_reshape, ntop=0)
        self.net["silence_rpn_bbox_pred"] = L.Silence(rpn_bbox_pred, ntop=0)
    else:
        rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(
            out, gt_boxes, im_info, data)
        rois, scores = self.roi_proposals(
            rpn_cls_score_reshape, rpn_bbox_pred, im_info, gt_boxes)

    feat_out = out

    if not self.deploy:
        # Detection and mask ROIs share one ROI-align pass.
        self.net["rois_cat"] = L.Concat(rois, mask_rois, name="rois_cat",
                                        axis=0)
        rois = self.net["rois_cat"]

    feat_aligned = self.roi_align("det_mask", feat_out, rois)

    # Last residual stage runs on the aligned ROI features.
    out = feat_aligned
    index += 1
    for j in range(self.stages[-1]):
        block_name = "res" + str(index) + ascii_lowercase[j]
        if j == 0:
            stride = 1
            out = residual_block(block_name, out, channals, stride)
        else:
            out = residual_block(block_name, out, channals)

    if not self.deploy:
        # Undo the concatenation: first self.rois_num rows are detection
        # features, the remainder are mask features.
        self.net["det_feat"], self.net["mask_feat"] = L.Slice(
            out, ntop=2, name='slice',
            slice_param=dict(slice_dim=0, slice_point=self.rois_num))
        feat_mask = self.net["mask_feat"]
        out = self.net["det_feat"]

    # for bbox detection
    pool5 = self.ave_pool(7, 1, "pool5", out)
    cls_score, bbox_pred = self.final_cls_bbox(pool5)

    if not self.deploy:
        self.net["loss_cls"] = L.SoftmaxWithLoss(
            cls_score, labels, loss_weight=1, propagate_down=[1, 0])
        self.net["loss_bbox"] = L.SmoothL1Loss(
            bbox_pred, bbox_targets, bbox_inside_weights,
            bbox_outside_weights, loss_weight=1)
    else:
        self.net["cls_prob"] = L.Softmax(cls_score)

    # for mask prediction
    mask_feat_aligned = out if self.deploy else feat_mask
    out = L.Deconvolution(
        mask_feat_aligned, name="mask_deconv1",
        convolution_param=dict(kernel_size=2, stride=2, num_output=256,
                               pad=0, bias_term=False,
                               weight_filler=dict(type='msra'),
                               bias_filler=dict(type='constant')))
    out = L.BatchNorm(out, name="bn_mask_deconv1", in_place=True,
                      batch_norm_param=dict(use_global_stats=self.deploy))
    out = L.Scale(out, name="scale_mask_deconv1", in_place=True,
                  scale_param=dict(bias_term=True))
    out = L.ReLU(out, name="mask_deconv1_relu", in_place=True)
    mask_out = self.conv_factory("mask_out", out, 1, self.classes - 1, 1, 0,
                                 bias_term=True)

    if not self.deploy:
        self.net["loss_mask"] = L.SigmoidCrossEntropyLoss(
            mask_out, masks, loss_weight=1, propagate_down=[1, 0],
            loss_param=dict(normalization=1, ignore_label=-1))
    else:
        self.net["mask_prob"] = L.Sigmoid(mask_out)

    return self.net.to_proto()
def resnet_mask_rcnn_rpn(self, stage=1):
    """Build the RPN-oriented variant of the ResNet Mask R-CNN network.

    The backbone and RPN are always built. When not deploying, the last
    residual stage is fed from a ``DummyData`` blob instead of real ROI
    features. With ``stage == 1`` the output of that stage is silenced;
    with ``stage == 2`` the box and mask heads are built and their outputs
    silenced.

    Args:
        stage: ``1`` builds the backbone with ``fixed=True``; any other
            value builds it with ``fixed=False``.

    Returns:
        The result of ``self.net.to_proto()``.
    """
    channals = self.channals

    if self.deploy:
        data, im_info = self.data_layer_test()
        gt_boxes = None
    else:
        data, im_info, gt_boxes = self.data_layer_train()

    pre_traned_fixed = (stage == 1)

    conv1 = self.conv_factory("conv1", data, 7, channals, 2, 3,
                              bias_term=True, fixed=pre_traned_fixed)
    pool1 = self.pooling_layer(3, 2, 'MAX', 'pool1', conv1)
    index = 1
    out = pool1

    residual_block = (self.residual_block if self.module == "normal"
                      else self.residual_block_basic)

    # Backbone stages (all but the last); only the first block of a stage is
    # given an explicit stride.
    for n_blocks in self.stages[:-1]:
        index += 1
        for j in range(n_blocks):
            label = "res" + str(index) + ascii_lowercase[j]
            if j != 0:
                out = residual_block(label, out, channals,
                                     fixed=pre_traned_fixed)
                continue
            stride = 1 if index == 2 else 2
            out = residual_block(label, out, channals, stride,
                                 fixed=pre_traned_fixed)
        channals *= 2

    if self.deploy:
        rpn_cls_score_reshape, rpn_bbox_pred = self.rpn(
            out, gt_boxes, im_info, data)
        rois, scores = self.roi_proposals(
            rpn_cls_score_reshape, rpn_bbox_pred, im_info, gt_boxes)
    else:
        # Training additionally returns the two RPN losses.
        rpn_cls_loss, rpn_loss_bbox, rpn_cls_score_reshape, rpn_bbox_pred = \
            self.rpn(out, gt_boxes, im_info, data)

    if not self.deploy:
        # Stand-in input for the last stage while only the RPN is trained.
        self.net["dummy_roi_pool_conv5"] = L.DummyData(
            name="dummy_roi_pool_conv5",
            shape=[dict(dim=[1, channals * 2, 14, 14])])
        out = self.net["dummy_roi_pool_conv5"]

    index += 1
    for j in range(self.stages[-1]):
        label = "res" + str(index) + ascii_lowercase[j]
        if j != 0:
            out = residual_block(label, out, channals)
            continue
        stride = 1
        out = residual_block(label, out, channals, stride)

    if stage == 1:
        self.net["silence_res"] = L.Silence(out, ntop=0)

    if stage == 2:
        # bbox detection head (outputs silenced)
        pool5 = self.ave_pool(7, 1, "pool5", out)
        cls_score, bbox_pred = self.final_cls_bbox(pool5)
        self.net["silence_cls_score"] = L.Silence(cls_score, ntop=0)
        self.net["silence_bbox_pred"] = L.Silence(bbox_pred, ntop=0)

        # mask prediction head (output silenced)
        mask_conv1 = self.conv_factory("mask_conv1", out, 3, 256, 1, 1,
                                       bias_term=True)
        mask_out = self.conv_factory("mask_out", mask_conv1, 1, self.classes,
                                     1, 0, bias_term=True)
        self.net["silence_mask_out"] = L.Silence(mask_out, ntop=0)

    return self.net.to_proto()
def compute_valid_io_shapes(netconf, netmode, min_output_shape,
                            max_output_shape, fmaps_in=1, fmaps_out=1,
                            constraints=None):
    """Enumerate input shapes the network accepts, and feature-map limits.

    Phase 1 walks the dimensions one at a time. For each dimension it
    trial-builds the network (via ``NetworkGenerator.implement_usknet``) for
    candidate input shapes, combining the shapes accepted for the previous
    dimensions with either ``constraints[current_dim](in_shape)`` or a
    counter running from ``min_output_shape[current_dim]`` up to
    ``max_output_shape[current_dim]``. A candidate is recorded when the
    build raised nothing and the produced output shape is not
    ``>= max_output_shape`` in every dimension seen so far.

    Phase 2 takes every accepted input shape and searches for the starting
    feature-map count at which the build first fails: doubling until a
    failure, then bisecting.

    Args:
        netconf: Network configuration forwarded to ``NetworkGenerator``.
        netmode: Phase forwarded to ``NetworkGenerator``.
        min_output_shape: Per-dimension start value of the counter.
        max_output_shape: Per-dimension upper bound.
        fmaps_in: Feature maps of the input blob in phase 1.
        fmaps_out: Output feature maps requested from the network.
        constraints: Optional per-dimension callables; entry ``d`` (if not
            ``None``) computes the size of dimension ``d`` from a shape.

    Returns:
        ``(valid_in_shapes, valid_out_shapes, max_fmap_counts)``.
    """
    valid_in_shapes = []
    valid_out_shapes = []
    dims = len(min_output_shape)

    for current_dim in range(dims):
        filtered_in_shapes = copy.deepcopy(valid_in_shapes)
        # Invariant for this dimension, so evaluate it once.
        constrained = (constraints is not None
                       and len(constraints) > current_dim
                       and constraints[current_dim] is not None)
        # NOTE(review): the constrained form indexes filtered_in_shapes[0],
        # which is empty while current_dim == 0 -- confirm callers never
        # constrain the first dimension.
        if constrained:
            in_shape = [(constraints[i](filtered_in_shapes[0])
                         if i >= current_dim else filtered_in_shapes[0][i])
                        for i in range(current_dim + 1)]
        else:
            in_shape = [(min_output_shape[i]
                         if i >= current_dim else filtered_in_shapes[0][i])
                        for i in range(current_dim + 1)]
        in_index = 0
        valid_in_shapes = []
        valid_out_shapes = []

        while True:
            net = caffe.NetSpec()
            run_shape = RunShape(None, None)
            run_shape.shape = in_shape[0:current_dim + 1]
            run_shape.dilation = [1] * dims
            run_shape.fmaps = fmaps_in
            run_shape_in = [run_shape]
            run_shape_out = run_shape_in
            netgen = NetworkGenerator(netconf, netmode)

            limit_reached = False
            valid_io_shape = True
            try:
                net.data, net.datai = netgen.data_layer(
                    [1] + [fmaps_in] + in_shape[0:current_dim + 1])
                net.silence = L.Silence(net.datai, ntop=0)
                # Chained blob list to construct the network (forward
                # direction); all networks start with data.
                blobs = [net.data]
                netgen.implement_usknet(netconf, net, run_shape_out, blobs,
                                        1, fmaps_out)
            except (MemoryLimitException, ConvolutionBufferException):
                # Shape itself is fine, but a resource limit was hit.
                limit_reached = True
                valid_io_shape = True
            except (ShapeException, LayerException):
                limit_reached = False
                valid_io_shape = False

            # `all` replaces the Python-2-only builtin `reduce`.
            exceeds_max = all(
                run_shape_out[-1].shape[i] >= max_output_shape[i]
                for i in range(current_dim + 1))
            if valid_io_shape and not limit_reached and not exceeds_max:
                print("++++ Valid: %s => %s" % (run_shape_out[0].shape,
                                                run_shape_out[-1].shape))
                valid_in_shapes += [run_shape_out[0].shape]
                valid_out_shapes += [run_shape_out[-1].shape]
            else:
                print("-- Invalid: %s => []" % (run_shape_out[0].shape))

            # Advance like an odometer: first step through the shapes
            # accepted for the lower dimensions, then bump this dimension.
            incremented = False
            if ((valid_io_shape or limit_reached)
                    and len(filtered_in_shapes) > 0):
                if in_index >= len(filtered_in_shapes) - 1:
                    # Wrapped around: not counted as an increment.
                    in_index = 0
                    in_shape[0:current_dim] = filtered_in_shapes[in_index]
                else:
                    in_index += 1
                    in_shape[0:current_dim] = filtered_in_shapes[in_index]
                    incremented = True

            if constrained:
                in_shape[current_dim] = constraints[current_dim](in_shape)
                if in_index > 0:
                    incremented = True
            elif not incremented:
                if in_shape[current_dim] >= max_output_shape[current_dim]:
                    in_shape[current_dim] = min_output_shape[current_dim]
                else:
                    in_shape[current_dim] += 1
                    incremented = True

            if not incremented:
                break

        if not valid_in_shapes:
            break

    max_fmap_counts = []
    for shape_idx, valid_shape in enumerate(valid_in_shapes):
        incexp = True    # exponential growth phase
        bisect = False   # bisection phase
        fmaps_start = 1
        lower_limit = 1
        upper_limit = 1

        while True:
            net = caffe.NetSpec()
            run_shape = RunShape(None, None)
            run_shape.shape = valid_shape
            run_shape.dilation = [1] * dims
            run_shape.fmaps = 1
            run_shape_in = [run_shape]
            run_shape_out = run_shape_in
            netgen = NetworkGenerator(netconf, netmode)

            limit_reached = False
            valid_io_shape = True
            try:
                net.data, net.datai = netgen.data_layer(
                    [1] + [1] + valid_shape)
                net.silence = L.Silence(net.datai, ntop=0)
                # Chained blob list to construct the network (forward
                # direction); all networks start with data.
                blobs = [net.data]
                netgen.implement_usknet(netconf, net, run_shape_out, blobs,
                                        fmaps_start, fmaps_out)
            except (MemoryLimitException, ConvolutionBufferException,
                    ShapeException, LayerException):
                limit_reached = True

            if not limit_reached and incexp:
                fmaps_start *= 2
            elif limit_reached and incexp:
                incexp = False
                bisect = True
                # Floor division keeps the feature-map counts integral on
                # Python 3 as well as Python 2.
                lower_limit = fmaps_start // 2
                upper_limit = fmaps_start
            elif not limit_reached and bisect:
                if lower_limit >= upper_limit:
                    break
                lower_limit = fmaps_start + 1
            elif limit_reached and bisect:
                upper_limit = fmaps_start - 1

            if bisect:
                fmaps_start = (upper_limit + lower_limit) // 2

            print("%s in [%s, %s]" % (fmaps_start, lower_limit, upper_limit))

        max_fmap_counts += [upper_limit]
        print("Current shape: %s, %s, %s" % (shape_idx, valid_shape,
                                             upper_limit))

    return valid_in_shapes, valid_out_shapes, max_fmap_counts
def caffenet(netconf, netmode):
    """Assemble the parallel-U-Net network definition for one phase.

    The data input is created for every phase. For ``caffe_pb2.TEST`` its
    second top is silenced and a prediction layer (``net.prob``) is attached
    to the last blob returned by ``implement_parallel_unets``. For every
    other mode the label inputs required by ``netconf.loss_function`` are
    created and the matching loss is attached.

    Args:
        netconf: Network configuration (shapes, feature-map counts and
            ``loss_function``).
        netmode: Phase selector, compared against ``caffe_pb2.TEST``.

    Returns:
        The result of ``net.to_proto()`` for the generated network.
    """
    net = caffe.NetSpec()

    # NOTE(review): the run-shape bookkeeping and the JSON offsets/sizes
    # below are computed but not referenced again in this function.
    dims = len(netconf.input_shape)
    run_shape = RunShape(None, None)
    run_shape.shape = netconf.input_shape
    run_shape.dilation = [1 for _ in range(0, dims)]
    run_shape.fmaps = 1
    run_shape_in = [run_shape]
    run_shape_out = run_shape_in

    offsets = [
        (netconf.input_shape3d[axis] - netconf.output_shape3d[axis]) / 2 - 1
        for axis in (-3, -2, -1)
    ]
    sizes = netconf.output_shape3d
    param = {"offsets": offsets, "sizes": sizes}
    param_json = json.dumps(param)

    testing = netmode == caffe_pb2.TEST

    # Common to both phases: generator and data input.
    netgen = NetworkGenerator(netconf, netmode)
    net.data, net.datai = netgen.data_layer(
        [1] + [netconf.fmap_input] + netconf.input_shape3d)

    if testing:
        net.silence = L.Silence(net.datai, ntop=0)
    elif netconf.loss_function == 'malis':
        net.label, net.labeli = netgen.data_layer(
            [1] + [netconf.fmap_output] + netconf.output_shape)
        net.components, net.componentsi = netgen.data_layer(
            [1, 1] + netconf.output_shape)
        net.nhood, net.nhoodi = netgen.data_layer(
            [1, 1] + [netconf.fmap_output] + [3])
        net.silence = L.Silence(net.datai, net.labeli, net.componentsi,
                                net.nhoodi, ntop=0)
    elif netconf.loss_function == 'euclid':
        net.label, net.labeli = netgen.data_layer(
            [1] + [netconf.fmap_output3d] + netconf.output_shape3d)
        net.scale, net.scalei = netgen.data_layer(
            [1] + [netconf.fmap_output3d] + netconf.output_shape3d)
        net.silence = L.Silence(net.datai, net.labeli, net.scalei, ntop=0)
    elif netconf.loss_function == 'softmax':
        net.label, net.labeli = netgen.data_layer(
            [1] + [1] + netconf.output_shape)
        net.silence = L.Silence(net.datai, net.labeli, ntop=0)

    # Chained blob list to construct the network (forward direction)
    chain = []
    net_blobs, loss_flag = implement_parallel_unets(
        netconf, netgen, net, netmode)
    chain = chain + net_blobs
    last_blob = chain[-1]

    if testing:
        # Prediction layer
        if netconf.loss_function in ('malis', 'euclid'):
            net.prob = L.Sigmoid(last_blob, ntop=1)
        elif netconf.loss_function == 'softmax':
            net.prob = L.Softmax(last_blob, ntop=1)
    else:
        # Loss
        if netconf.loss_function == 'malis':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.MalisLoss(last_blob, net.label, net.components,
                                   net.nhood, ntop=0)
        elif netconf.loss_function == 'euclid':
            last_blob = L.Sigmoid(last_blob, in_place=True)
            net.loss = L.GatedEuclideanLoss(last_blob, net.label, net.scale,
                                            loss_flag, ntop=0)
        elif netconf.loss_function == 'softmax':
            net.loss = L.SoftmaxWithLoss(last_blob, net.label, ntop=0)

    # Protocol buffer of the generated network
    return net.to_proto()
def generator_proto(mode, batchsize, T, exp_T, question_vocab_size,
                    exp_vocab_size, use_gt=True):
    """Define the VQA + explanation-attention network.

    The VQA part (word embedding, two stacked LSTMs, single-map soft
    attention, answer classifier) is built with the module-level
    ``fixed_weights`` / ``fixed_weights_lstm`` layer params. The explanation
    part embeds an answer, merges it with the question/image features,
    computes a second attention map and ends in a silenced feature.

    Args:
        mode: Forwarded to the data layer inside its JSON ``param_str``.
        batchsize: Forwarded to the data layer and used as the first
            dimension of the constant ``DummyData`` blobs.
        T: Number of slices taken along axis 0 of each LSTM output.
        exp_T: Not referenced in this function.
        question_vocab_size: ``input_dim`` of the word ``Embed`` layer.
        exp_vocab_size: Not referenced in this function.
        use_gt: If true, the embedded answer is ``n.label``; otherwise it is
            the ``ArgMax`` (axis 1) of ``n.prediction``.

    Returns:
        The result of ``n.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()

    # --- small builders for the layer patterns that repeat below ---------
    def uniform_filler():
        return dict(type='uniform', min=-0.08, max=0.08)

    def reshape(bottom, dims):
        return L.Reshape(bottom, reshape_param=dict(shape=dict(dim=dims)))

    def dropout(bottom, ratio):
        return L.Dropout(bottom, dropout_param={'dropout_ratio': ratio})

    def product(lhs, rhs):
        return L.Eltwise(lhs, rhs,
                         eltwise_param={'operation': P.Eltwise.PROD})

    def conv1x1(bottom, num_output, **extra):
        return L.Convolution(bottom, kernel_size=1, stride=1,
                             num_output=num_output, pad=0,
                             weight_filler=dict(type='xavier'), **extra)

    def fully_connected(bottom, num_output, **extra):
        return L.InnerProduct(bottom, num_output=num_output,
                              weight_filler=dict(type='xavier'), **extra)

    def lstm(bottom):
        return L.LSTM(
            bottom, n.cont,
            recurrent_param=dict(
                num_output=1024,
                weight_filler=uniform_filler(),
                bias_filler=dict(type='constant', value=0)),
            param=fixed_weights_lstm)

    def last_slice(lstm_top, slice_prefix, silence_prefix):
        # Slice along axis 0 into T tops, name and silence the first T - 1,
        # hand back the final one.
        pieces = L.Slice(lstm_top, ntop=T, slice_param={'axis': 0})
        for t in range(T - 1):
            setattr(n, slice_prefix + str(t), pieces[int(t)])
            setattr(n, silence_prefix + str(t),
                    L.Silence(pieces[int(t)], ntop=0))
        return pieces[T - 1]

    def ones_blob():
        return L.DummyData(shape=dict(dim=[batchsize, 1]),
                           data_filler=dict(type='constant', value=1),
                           ntop=1)

    # --- inputs -----------------------------------------------------------
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize})
    (n.data, n.cont, n.img_feature, n.label,
     n.exp, n.exp_out, n.exp_cont_1, n.exp_cont_2) = L.Python(
        module='vqa_data_provider_layer',
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=8)

    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size,
                         num_output=300, weight_filler=uniform_filler(),
                         param=fixed_weights)
    n.embed = L.TanH(n.embed_ba)

    # --- LSTM1 ------------------------------------------------------------
    n.lstm1 = lstm(n.embed)
    n.lstm1_out = last_slice(n.lstm1, 'slice_first', 'silence_data_first')
    n.lstm1_reshaped = reshape(n.lstm1_out, [-1, 1024])
    n.lstm1_reshaped_droped = dropout(n.lstm1_reshaped, 0.3)
    n.lstm1_droped = dropout(n.lstm1, 0.3)

    # --- LSTM2 ------------------------------------------------------------
    n.lstm2 = lstm(n.lstm1_droped)
    n.lstm2_out = last_slice(n.lstm2, 'slice_second', 'silence_data_second')
    n.lstm2_reshaped = reshape(n.lstm2_out, [-1, 1024])
    n.lstm2_reshaped_droped = dropout(n.lstm2_reshaped, 0.3)
    concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped]
    n.lstm_12 = L.Concat(*concat_botom)

    # --- tile question feature --------------------------------------------
    n.q_emb_resh = reshape(n.lstm_12, [-1, 2048, 1, 1])
    n.q_emb_tiled_1 = L.Tile(n.q_emb_resh, axis=2, tiles=14)
    n.q_emb_resh_tiled = L.Tile(n.q_emb_tiled_1, axis=3, tiles=14)

    # --- embed image feature ----------------------------------------------
    n.i_emb = conv1x1(n.img_feature, 2048, param=fixed_weights)

    # --- eltwise product and normalization --------------------------------
    n.eltwise = product(n.q_emb_resh_tiled, n.i_emb)
    n.eltwise_sqrt = L.SignedSqrt(n.eltwise)
    n.eltwise_l2 = L.L2Normalize(n.eltwise_sqrt)
    n.eltwise_drop = dropout(n.eltwise_l2, 0.3)

    # --- attention for VQA ------------------------------------------------
    n.att_conv1 = conv1x1(n.eltwise_drop, 512, param=fixed_weights)
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = conv1x1(n.att_conv1_relu, 1, param=fixed_weights)
    n.att_reshaped = reshape(n.att_conv2, [-1, 1, 14 * 14])
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att_map = reshape(n.att_softmax, [-1, 1, 14, 14])
    dummy = ones_blob()
    n.att_feature = L.SoftAttention(n.img_feature, n.att_map, dummy)
    n.att_feature_resh = reshape(n.att_feature, [-1, 2048])

    # --- eltwise product + normalization again for VQA --------------------
    n.i_emb2 = fully_connected(n.att_feature_resh, 2048, param=fixed_weights)
    n.eltwise2 = product(n.lstm_12, n.i_emb2)
    n.eltwise2_sqrt = L.SignedSqrt(n.eltwise2)
    n.eltwise2_l2 = L.L2Normalize(n.eltwise2_sqrt)
    n.eltwise2_drop = dropout(n.eltwise2_l2, 0.3)
    n.prediction = fully_connected(n.eltwise2_drop, 3000,
                                   param=fixed_weights)

    # --- answer to embed: ground truth, or the VQA model's own argmax -----
    if use_gt:
        answer = n.label
    else:
        n.vqa_ans = L.ArgMax(n.prediction, axis=1)
        answer = n.vqa_ans
    n.exp_emb_ans = L.Embed(answer, input_dim=3000, num_output=300,
                            weight_filler=uniform_filler())
    n.exp_emb_ans_tanh = L.TanH(n.exp_emb_ans)
    n.exp_emb_ans2 = fully_connected(n.exp_emb_ans_tanh, 2048)

    # --- merge VQA answer and visual+textual feature ----------------------
    n.exp_emb_resh = reshape(n.exp_emb_ans2, [-1, 2048, 1, 1])
    n.exp_emb_tiled_1 = L.Tile(n.exp_emb_resh, axis=2, tiles=14)
    n.exp_emb_tiled = L.Tile(n.exp_emb_tiled_1, axis=3, tiles=14)
    n.eltwise_emb = conv1x1(n.eltwise, 2048)
    n.exp_eltwise = product(n.eltwise_emb, n.exp_emb_tiled)
    n.exp_eltwise_sqrt = L.SignedSqrt(n.exp_eltwise)
    n.exp_eltwise_l2 = L.L2Normalize(n.exp_eltwise_sqrt)
    n.exp_eltwise_drop = dropout(n.exp_eltwise_l2, 0.3)

    # --- attention for explanation ----------------------------------------
    n.exp_att_conv1 = conv1x1(n.exp_eltwise_drop, 512)
    n.exp_att_conv1_relu = L.ReLU(n.exp_att_conv1)
    n.exp_att_conv2 = conv1x1(n.exp_att_conv1_relu, 1)
    n.exp_att_reshaped = reshape(n.exp_att_conv2, [-1, 1, 14 * 14])
    n.exp_att_softmax = L.Softmax(n.exp_att_reshaped, axis=2)
    n.exp_att_map = reshape(n.exp_att_softmax, [-1, 1, 14, 14])
    exp_dummy = ones_blob()
    n.exp_att_feature_prev = L.SoftAttention(n.img_feature, n.exp_att_map,
                                             exp_dummy)
    n.exp_att_feature_resh = reshape(n.exp_att_feature_prev, [-1, 2048])
    n.exp_att_feature_embed = fully_connected(n.exp_att_feature_resh, 2048)
    n.exp_lstm12_embed = fully_connected(n.lstm_12, 2048)
    n.exp_eltwise2 = product(n.exp_lstm12_embed, n.exp_att_feature_embed)
    n.exp_att_feature = product(n.exp_emb_ans2, n.exp_eltwise2)
    n.silence_exp_att = L.Silence(n.exp_att_feature, ntop=0)

    return n.to_proto()
def SsdDetector(net, train=True, data_layer="data", gt_label="label",
                net_width=300, net_height=300, basenet="VGG",
                visualize=False, extra_data="data", eval_enable=True, **ssdparam):
    """Create an SSD detector on top of the selected backbone.

    Args:
        net: NetSpec-like object the layers are added to.
        train: True builds the TRAIN graph (loss); anything else builds the
            TEST graph (detection output and, optionally, evaluation).
        data_layer / gt_label: names of the data input and label input tops.
        net_width / net_height: input size of the network.
        basenet: feature network, one of "VGG", "Res101", "Res50", "PVA",
            "Yolo".
        visualize: in TEST mode, append ``net[extra_data]`` as an extra
            bottom of the detection-output layer and skip evaluation.
        extra_data: name of the top appended when ``visualize`` is set.
        eval_enable: in TEST mode, add the evaluation layer (when not
            visualizing); when False the detection output is silenced.
        **ssdparam: detector options (``num_classes``, ``multilayers_*``,
            ``multiloss_*``, ``detectionout_*``, ``detectioneval_*``,
            ``combine_yolo_ssd``). Missing keys fall back to the defaults
            written below; ``combine_yolo_ssd`` defaults to False.

    Returns:
        The whole SSD detector network (``net``).

    Raises:
        ValueError: for an unknown ``basenet`` or an unknown conf loss type.
    """
    # Backbone network and the layers the detector heads attach to.
    if basenet == "VGG":
        net = VGG16Net(net, from_layer=data_layer, fully_conv=True,
                       reduced=True, dilated=True, dropout=False)
        base_feature_layers = ['conv4_3', 'fc7']
    elif basenet == "Res101":
        net = ResNet101Net(net, from_layer=data_layer, use_pool5=False)
        # 1/8, 1/16, 1/32
        base_feature_layers = ['res3b3', 'res4b22', 'res5c']
    elif basenet == "Res50":
        net = ResNet50Net(net, from_layer=data_layer, use_pool5=False)
        base_feature_layers = ['res3d', 'res4f', 'res5c']
    elif basenet == "PVA":
        net = PvaNet(net, from_layer=data_layer)
        # 1/8, 1/16, 1/32
        base_feature_layers = [
            'conv4_1/incep/pre', 'conv5_1/incep/pre', 'conv5_4'
        ]
    elif basenet == "Yolo":
        net = YoloNet(net, from_layer=data_layer)
        base_feature_layers = ssdparam.get("multilayers_feature_map", [])
    else:
        raise ValueError(
            "only VGG16, Res50/101 and PVANet are supported in current version."
        )

    # The Yolo feature map is consumed as a list of layer groups. The fixed
    # backbones list plain layer names, so each name is wrapped as a
    # one-element group; previously `feature_layers` was never bound for
    # them and the loop below raised NameError.
    # NOTE(review): the AddSsdExtraConvLayers step is disabled (it used
    # add_layers=3 for VGG, 2 otherwise, first_channels=256,
    # second_channels=512), so heads attach directly to the backbone layers
    # -- confirm this is intended for the non-Yolo backbones.
    if basenet == "Yolo":
        feature_layers = base_feature_layers
    else:
        feature_layers = [[layer_name] for layer_name in base_feature_layers]

    # Collapse every group into a single layer name: one-element groups pass
    # through, larger groups are merged by UnifiedMultiScaleLayers into a
    # layer named after the concatenation of its members.
    result = []
    for item in feature_layers:
        if len(item) == 1:
            result.append(item[0])
            continue
        name = ""
        for layers in item:
            name += layers
        tags = ["Down", "Ref"]
        down_methods = [["Reorg"]]
        UnifiedMultiScaleLayers(net, layers=item, tags=tags,
                                unifiedlayer=name, dnsampleMethod=down_methods)
        result.append(name)
    feature_layers = result

    # Create the SSD detector headers.
    mbox_layers = SsdDetectorHeaders(
        net,
        min_ratio=ssdparam.get("multilayers_min_ratio", 15),
        max_ratio=ssdparam.get("multilayers_max_ratio", 90),
        boxsizes=ssdparam.get("multilayers_boxsizes", []),
        net_width=net_width,
        net_height=net_height,
        data_layer=data_layer,
        num_classes=ssdparam.get("num_classes", 2),
        from_layers=feature_layers,
        use_batchnorm=ssdparam.get("multilayers_use_batchnorm", True),
        prior_variance=ssdparam.get("multilayers_prior_variance",
                                    [0.1, 0.1, 0.2, 0.2]),
        normalizations=ssdparam.get("multilayers_normalizations", []),
        aspect_ratios=ssdparam.get("multilayers_aspect_ratios", []),
        flip=ssdparam.get("multilayers_flip", True),
        clip=ssdparam.get("multilayers_clip", False),
        inter_layer_channels=ssdparam.get("multilayers_inter_layer_channels", []),
        kernel_size=ssdparam.get("multilayers_kernel_size", 3),
        pad=ssdparam.get("multilayers_pad", 1))

    # Every other option has a default; a missing flag now means "plain SSD"
    # instead of raising KeyError.
    combine_yolo_ssd = ssdparam.get("combine_yolo_ssd", False)

    # Explicit comparison kept on purpose: only True/1 selects the TRAIN graph.
    if train == True:  # noqa: E712
        loss_param = get_loss_param(normalization=ssdparam.get(
            "multiloss_normalization", P.Loss.VALID))
        mbox_layers.append(net[gt_label])

        # Options shared by both loss flavours.
        shared_loss_kwargs = dict(
            loc_loss_type=ssdparam.get("multiloss_loc_loss_type",
                                       P.MultiBoxLoss.SMOOTH_L1),
            loc_weight=ssdparam.get("multiloss_loc_weight", 1),
            conf_weight=ssdparam.get("multiloss_conf_weight", 1),
            num_classes=ssdparam.get("num_classes", 2),
            share_location=ssdparam.get("multiloss_share_location", True),
            match_type=ssdparam.get("multiloss_match_type",
                                    P.MultiBoxLoss.PER_PREDICTION),
            overlap_threshold=ssdparam.get("multiloss_overlap_threshold", 0.5),
            use_prior_for_matching=ssdparam.get(
                "multiloss_use_prior_for_matching", True),
            background_label_id=ssdparam.get("multiloss_background_label_id", 0),
            use_difficult_gt=ssdparam.get("multiloss_use_difficult_gt", False),
            do_neg_mining=ssdparam.get("multiloss_do_neg_mining", True),
            neg_pos_ratio=ssdparam.get("multiloss_neg_pos_ratio", 3),
            neg_overlap=ssdparam.get("multiloss_neg_overlap", 0.5),
            code_type=ssdparam.get("multiloss_code_type",
                                   P.PriorBox.CENTER_SIZE),
            encode_variance_in_target=ssdparam.get(
                "multiloss_encode_variance_in_target", False),
            map_object_to_agnostic=ssdparam.get(
                "multiloss_map_object_to_agnostic", False),
            name_to_label_file=ssdparam.get("multiloss_name_to_label_file", ""))

        # Create the loss layer.
        if not combine_yolo_ssd:
            multiboxloss_param = get_multiboxloss_param(
                conf_loss_type=ssdparam.get("multiloss_conf_loss_type",
                                            P.MultiBoxLoss.SOFTMAX),
                **shared_loss_kwargs)
            net["mbox_loss"] = L.MultiBoxLoss(
                *mbox_layers,
                multibox_loss_param=multiboxloss_param,
                loss_param=loss_param,
                include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
                propagate_down=[True, True, False, False])
        else:
            multimcboxloss_param = get_multimcboxloss_param(
                rescore=ssdparam.get("multiloss_rescore", True),
                object_scale=ssdparam.get("multiloss_object_scale", 1),
                noobject_scale=ssdparam.get("multiloss_noobject_scale", 1),
                class_scale=ssdparam.get("multiloss_class_scale", 1),
                loc_scale=ssdparam.get("multiloss_loc_scale", 1),
                **shared_loss_kwargs)
            net["mbox_loss"] = L.MultiMcBoxLoss(
                *mbox_layers,
                multimcbox_loss_param=multimcboxloss_param,
                loss_param=loss_param,
                include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
                propagate_down=[True, True, False, False])
        return net

    # TEST graph.
    if not combine_yolo_ssd:
        # Turn the raw confidences (mbox_layers[1]) into scores.
        conf_loss_type = ssdparam.get("multiloss_conf_loss_type",
                                      P.MultiBoxLoss.SOFTMAX)
        if conf_loss_type == P.MultiBoxLoss.SOFTMAX:
            reshape_name = "mbox_conf_reshape"
            net[reshape_name] = L.Reshape(
                mbox_layers[1],
                shape=dict(dim=[0, -1, ssdparam.get("num_classes", 2)]))
            softmax_name = "mbox_conf_softmax"
            net[softmax_name] = L.Softmax(net[reshape_name], axis=2)
            flatten_name = "mbox_conf_flatten"
            net[flatten_name] = L.Flatten(net[softmax_name], axis=1)
            mbox_layers[1] = net[flatten_name]
        elif conf_loss_type == P.MultiBoxLoss.LOGISTIC:
            sigmoid_name = "mbox_conf_sigmoid"
            net[sigmoid_name] = L.Sigmoid(mbox_layers[1])
            mbox_layers[1] = net[sigmoid_name]
        else:
            raise ValueError("Unknown conf loss type.")

    det_out_param = get_detection_out_param(
        num_classes=ssdparam.get("num_classes", 2),
        share_location=ssdparam.get("multiloss_share_location", True),
        background_label_id=ssdparam.get("multiloss_background_label_id", 0),
        code_type=ssdparam.get("multiloss_code_type", P.PriorBox.CENTER_SIZE),
        variance_encoded_in_target=ssdparam.get(
            "multiloss_encode_variance_in_target", False),
        conf_threshold=ssdparam.get("detectionout_conf_threshold", 0.01),
        nms_threshold=ssdparam.get("detectionout_nms_threshold", 0.45),
        boxsize_threshold=ssdparam.get("detectionout_boxsize_threshold", 0.001),
        top_k=ssdparam.get("detectionout_top_k", 30),
        visualize=ssdparam.get("detectionout_visualize", False),
        visual_conf_threshold=ssdparam.get(
            "detectionout_visualize_conf_threshold", 0.5),
        visual_size_threshold=ssdparam.get(
            "detectionout_visualize_size_threshold", 0),
        display_maxsize=ssdparam.get("detectionout_display_maxsize", 1000),
        line_width=ssdparam.get("detectionout_line_width", 4),
        color=ssdparam.get("detectionout_color", [[0, 255, 0], ]))
    if visualize:
        mbox_layers.append(net[extra_data])
    if not combine_yolo_ssd:
        net.detection_out = L.DetectionOutput(
            *mbox_layers,
            detection_output_param=det_out_param,
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))
    else:
        net.detection_out = L.DetectionMultiMcOutput(
            *mbox_layers,
            detection_output_param=det_out_param,
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))

    if not visualize and eval_enable:
        # Create the evaluation layer.
        det_eval_param = get_detection_eval_param(
            num_classes=ssdparam.get("num_classes", 2),
            background_label_id=ssdparam.get("multiloss_background_label_id", 0),
            evaluate_difficult_gt=ssdparam.get(
                "detectioneval_evaluate_difficult_gt", False),
            boxsize_threshold=ssdparam.get(
                "detectioneval_boxsize_threshold",
                [0, 0.01, 0.05, 0.1, 0.15, 0.2, 0.25]),
            iou_threshold=ssdparam.get("detectioneval_iou_threshold",
                                       [0.9, 0.75, 0.5]),
            name_size_file=ssdparam.get("detectioneval_name_size_file", ""))
        net.detection_eval = L.DetectionEvaluate(
            net.detection_out, net[gt_label],
            detection_evaluate_param=det_eval_param,
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))
    if not eval_enable:
        # The top name (including its spelling) is part of the generated
        # net definition, so it is kept unchanged.
        net.slience = L.Silence(
            net.detection_out, ntop=0,
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))
    return net
def generate_model(split, config):
    """Assemble the image/language scoring network and return its proto.

    A Python data layer provides five tops (language, cont, image, spatial,
    label). The image goes through a VGG-16 style convolutional stack and
    fully-convolutional fc6-fc8 layers; the language goes through an
    embedding and an LSTM whose last time slice is tiled over the feature
    map. Both are L2-normalized, concatenated with the spatial top, and
    scored by a two-layer 1x1-conv classifier trained with
    SigmoidCrossEntropyLoss.

    Args:
        split: forwarded to the data layer inside ``param_str``.
        config: object supplying N, data_provider, data_provider_layer,
            fix_vgg, vgg_dropout, vocab_size, embed_dim, lstm_dim, T,
            featmap_H, featmap_W, mlp_hidden_dims and mlp_dropout.

    Returns:
        The value of ``NetSpec.to_proto()`` for the assembled net.
    """
    n = caffe.NetSpec()
    provider_args = str(dict(split=split, batch_size=config.N))
    n.language, n.cont, n.image, n.spatial, n.label = L.Python(
        module=config.data_provider,
        layer=config.data_provider_layer,
        param_str=provider_args,
        ntop=5)

    # Base net (VGG-16): (number of conv+relu pairs, channels) per block,
    # each block followed by a max pool. Tops are registered in the same
    # order as a hand-unrolled definition would produce.
    vgg_blocks = [(2, 64), (2, 128), (3, 256), (3, 512), (3, 512)]
    bottom = n.image
    for block_no, (pair_count, channels) in enumerate(vgg_blocks, 1):
        for pair_no in range(1, pair_count + 1):
            conv_top, relu_top = conv_relu(bottom, channels,
                                           fix_param=config.fix_vgg,
                                           finetune=(not config.fix_vgg))
            setattr(n, 'conv%d_%d' % (block_no, pair_no), conv_top)
            setattr(n, 'relu%d_%d' % (block_no, pair_no), relu_top)
            bottom = relu_top
        bottom = max_pool(bottom)
        setattr(n, 'pool%d' % block_no, bottom)

    # Fully convolutional fc6-fc8, with optional in-place dropout after the
    # fc6 and fc7 activations.
    n.fcn_fc6, n.fcn_relu6 = conv_relu(n.pool5, 4096, ks=7, pad=3)
    if config.vgg_dropout:
        n.fcn_drop6 = L.Dropout(n.fcn_relu6, dropout_ratio=0.5, in_place=True)
        n.fcn_fc7, n.fcn_relu7 = conv_relu(n.fcn_drop6, 4096, ks=1, pad=0)
        n.fcn_drop7 = L.Dropout(n.fcn_relu7, dropout_ratio=0.5, in_place=True)
        fc8_bottom = n.fcn_drop7
    else:
        n.fcn_fc7, n.fcn_relu7 = conv_relu(n.fcn_relu6, 4096, ks=1, pad=0)
        fc8_bottom = n.fcn_relu7
    n.fcn_fc8 = conv(fc8_bottom, 1000, ks=1, pad=0)

    # Word embedding.
    n.embed = L.Embed(
        n.language,
        input_dim=config.vocab_size,
        num_output=config.embed_dim,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))

    # LSTM over the embedded sequence.
    lstm_settings = dict(
        num_output=config.lstm_dim,
        weight_filler=dict(type='uniform', min=-0.08, max=0.08),
        bias_filler=dict(type='constant', value=0))
    n.lstm = L.LSTM(n.embed, n.cont, recurrent_param=lstm_settings)

    # Slice along axis 0 into config.T tops; every slice but the last is
    # named and fed to a Silence layer, the last one is kept as the feature.
    time_slices = L.Slice(n.lstm, ntop=config.T, slice_param=dict(axis=0))
    for step in range(config.T - 1):
        unused_slice = time_slices[step]
        setattr(n, 'slice' + str(step), unused_slice)
        setattr(n, 'silence' + str(step), L.Silence(unused_slice, ntop=0))
    n.lstm_out = time_slices[-1]
    n.lstm_feat = L.Reshape(
        n.lstm_out,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim])))

    # Tile the LSTM feature over axes 2 and 3.
    n.lstm_resh = L.Reshape(
        n.lstm_feat,
        reshape_param=dict(shape=dict(dim=[-1, config.lstm_dim, 1, 1])))
    n.lstm_tile_1 = L.Tile(n.lstm_resh, axis=2, tiles=config.featmap_H)
    n.lstm_tile_2 = L.Tile(n.lstm_tile_1, axis=3, tiles=config.featmap_W)

    # L2-normalize the image and language features, then concatenate them
    # with the spatial top along axis 1.
    n.img_l2norm = L.L2Normalize(n.fcn_fc8)
    n.lstm_l2norm = L.L2Normalize(n.lstm_tile_2)
    n.feat_all = L.Concat(n.lstm_l2norm, n.img_l2norm, n.spatial,
                          concat_param=dict(axis=1))

    # MLP classifier (1x1 convolutions) over the concatenated feature.
    n.fcn_l1, n.fcn_relu1 = conv_relu(n.feat_all, config.mlp_hidden_dims,
                                      ks=1, pad=0)
    if config.mlp_dropout:
        n.fcn_drop1 = L.Dropout(n.fcn_relu1, dropout_ratio=0.5, in_place=True)
        score_bottom = n.fcn_drop1
    else:
        score_bottom = n.fcn_relu1
    n.fcn_scores = conv(score_bottom, 1, ks=1, pad=0)

    # Loss layer.
    n.loss = L.SigmoidCrossEntropyLoss(n.fcn_scores, n.label)
    return n.to_proto()
def _cpm_convolution(bottom, kernel_size, num_output, lr_m):
    """Return the Convolution layer used by every conv step of the builder."""
    return L.Convolution(
        bottom,
        kernel_size=kernel_size,
        num_output=num_output,
        pad=int(math.floor(kernel_size / 2)),
        param=[
            dict(lr_mult=lr_m, decay_mult=1),
            dict(lr_mult=lr_m * 2, decay_mult=0)
        ],
        weight_filler=dict(type='gaussian', std=0.01),
        bias_filler=dict(type='constant'))


def _frozen_batchnorm(bottom):
    """Return a BatchNorm layer whose three params all have lr_mult=0."""
    return L.BatchNorm(
        bottom,
        param=[dict(lr_mult=0), dict(lr_mult=0), dict(lr_mult=0)])


def setLayers_twoBranches(data_source, batch_size, layername, kernel, stride, outCH, label_name, transform_param_in, deploy=False, batchnorm=0, lr_mult_distro=[1, 1, 1]):
    """Build the two-branch multi-stage net and return its prototxt text.

    ``layername`` is a sequence of layer codes walked in order; ``kernel``,
    ``stride`` and ``outCH`` are parallel sequences of the same length:

        'V'  VGG-style conv + in-place ReLU (lr from lr_mult_distro[0])
        'B'  start a new VGG block (bumps the pool counter)
        'C'  single conv feeding both branches (+ optional BN, ReLU)
        'C2' one conv per branch (L1 / L2)
        'P'  max pooling
        'L', 'L2', 'L3'  end-of-stage losses (training only)
        'D'  in-place dropout (training only)
        '@'  concat both branches with the shared feature, enter 'fuse' state
        '$'  remember the current layer as the shared feature

    Args:
        data_source: data path; a path containing "lmdb" selects the CPMData
            input, anything else the HDF5Data input. Only inspected when
            ``deploy`` is False.
        batch_size: batch size of the data layer.
        layername, kernel, stride, outCH: see above. NOTE: for a 'C2' entry
            directly followed by 'L2'/'L3', ``outCH`` is overwritten in
            place with the hard-coded 38 / 19 output channels.
        label_name: names of the label tops.
        transform_param_in: CPM transform parameters; ``'num_parts'`` is
            read from it for the LMDB input.
        deploy: when True, emit a deploy net with an ``input:`` header
            instead of a data layer and without losses/dropout.
        batchnorm: 1 inserts BatchNorm after 'C'/'C2' convolutions.
        lr_mult_distro: learning-rate multipliers; index 0 VGG, 1 image-state
            convs, 2 fuse-state convs. 'L2'/'L3' additionally read index 3 as
            the first branch's loss weight, so the 3-element default raises
            IndexError for nets containing those codes.

    Returns:
        The net definition as a string.
    """
    n = caffe.NetSpec()
    assert len(layername) == len(kernel)
    assert len(layername) == len(stride)
    assert len(layername) == len(outCH)

    # Explicit comparison kept so only False/0 selects the training graph.
    training = (deploy == False)  # noqa: E712
    # data_source is only inspected for training nets, as before.
    use_hdf5 = training and "lmdb" not in data_source

    if use_hdf5:
        if len(label_name) == 1:
            n.data, n.tops[label_name[0]] = L.HDF5Data(
                hdf5_data_param=dict(batch_size=batch_size, source=data_source),
                ntop=2)
        elif len(label_name) == 2:
            n.data, n.tops[label_name[0]], n.tops[label_name[1]] = L.HDF5Data(
                hdf5_data_param=dict(batch_size=batch_size, source=data_source),
                ntop=3)
    elif training:
        # Read num_parts from the parameter actually passed in (the previous
        # code looked up an unrelated `transform_param` name), and only on
        # the path that needs it.
        num_parts = transform_param_in['num_parts']
        n.data, n.tops['label'] = L.CPMData(
            data_param=dict(backend=1, source=data_source,
                            batch_size=batch_size),
            cpm_transform_param=transform_param_in,
            ntop=2)
        (n.tops[label_name[2]], n.tops[label_name[3]],
         n.tops[label_name[4]], n.tops[label_name[5]]) = L.Slice(
             n.label,
             slice_param=dict(
                 axis=1, slice_point=[38, num_parts + 1, num_parts + 39]),
             ntop=4)
        n.tops[label_name[0]] = L.Eltwise(n.tops[label_name[2]],
                                          n.tops[label_name[4]],
                                          operation=P.Eltwise.PROD)
        n.tops[label_name[1]] = L.Eltwise(n.tops[label_name[3]],
                                          n.tops[label_name[5]],
                                          operation=P.Eltwise.PROD)
    else:
        # Deploy: make an empty "data" layer so the next layer accepting
        # input takes the correct blob name "data". The placeholder is
        # removed from the serialized string at the end.
        n.data = L.Layer()

    # Something special before everything: split the input into the image
    # channels and the center map, which is silenced.
    n.image, n.center_map = L.Slice(n.data,
                                    slice_param=dict(axis=1, slice_point=3),
                                    ntop=2)
    n.silence2 = L.Silence(n.center_map, ntop=0)

    # Just follow the arrays.. CPCPCPCPCCCC....
    last_layer = ['image', 'image']  # current bottom of branch L1 / L2
    stage = 1
    conv_counter = 1
    pool_counter = 1
    drop_counter = 1
    local_counter = 1
    state = 'image'  # can be image or fuse
    share_point = 0

    for idx in range(len(layername)):
        kind = layername[idx]

        if kind == 'V':  # pretrained VGG layers
            conv_name = 'conv%d_%d' % (pool_counter, local_counter)
            lr_m = lr_mult_distro[0]
            n.tops[conv_name] = _cpm_convolution(
                n.tops[last_layer[0]], kernel[idx], outCH[idx], lr_m)
            last_layer[0] = conv_name
            last_layer[1] = conv_name
            print('%s\tch=%d\t%.1f' % (last_layer[0], outCH[idx], lr_m))
            ReLUname = 'relu%d_%d' % (pool_counter, local_counter)
            n.tops[ReLUname] = L.ReLU(n.tops[last_layer[0]], in_place=True)
            local_counter += 1
            print(ReLUname)

        elif kind == 'B':
            pool_counter += 1
            local_counter = 1

        elif kind == 'C':
            if state == 'image':
                # no image state in subsequent stages
                conv_name = 'conv%d_%d_CPM' % (pool_counter, local_counter)
                lr_m = lr_mult_distro[1]
            else:  # fuse
                conv_name = 'Mconv%d_stage%d' % (conv_counter, stage)
                lr_m = lr_mult_distro[2]
                conv_counter += 1
            n.tops[conv_name] = _cpm_convolution(
                n.tops[last_layer[0]], kernel[idx], outCH[idx], lr_m)
            last_layer[0] = conv_name
            last_layer[1] = conv_name
            print('%s\tch=%d\t%.1f' % (last_layer[0], outCH[idx], lr_m))

            # No activation directly in front of a loss.
            if layername[idx + 1] != 'L':
                if state == 'image':
                    if batchnorm == 1:
                        batchnorm_name = 'bn%d_stage%d' % (conv_counter, stage)
                        n.tops[batchnorm_name] = _frozen_batchnorm(
                            n.tops[last_layer[0]])
                        last_layer[0] = batchnorm_name
                    ReLUname = 'relu%d_%d_CPM' % (pool_counter, local_counter)
                else:
                    if batchnorm == 1:
                        batchnorm_name = 'Mbn%d_stage%d' % (conv_counter, stage)
                        n.tops[batchnorm_name] = _frozen_batchnorm(
                            n.tops[last_layer[0]])
                        last_layer[0] = batchnorm_name
                    ReLUname = 'Mrelu%d_stage%d' % (conv_counter, stage)
                n.tops[ReLUname] = L.ReLU(n.tops[last_layer[0]], in_place=True)
                print(ReLUname)
            local_counter += 1

        elif kind == 'C2':
            for level in range(0, 2):
                if state == 'image':
                    # no image state in subsequent stages
                    conv_name = 'conv%d_%d_CPM_L%d' % (
                        pool_counter, local_counter, level + 1)
                    lr_m = lr_mult_distro[1]
                else:  # fuse
                    conv_name = 'Mconv%d_stage%d_L%d' % (conv_counter, stage,
                                                         level + 1)
                    lr_m = lr_mult_distro[2]

                # Prediction convs get fixed output sizes per branch.
                if layername[idx + 1] == 'L2' or layername[idx + 1] == 'L3':
                    if level == 0:
                        outCH[idx] = 38
                    else:
                        outCH[idx] = 19

                n.tops[conv_name] = _cpm_convolution(
                    n.tops[last_layer[level]], kernel[idx], outCH[idx], lr_m)
                last_layer[level] = conv_name
                print('%s\tch=%d\t%.1f' % (last_layer[level], outCH[idx], lr_m))

                if layername[idx + 1] != 'L2' and layername[idx + 1] != 'L3':
                    if state == 'image':
                        if batchnorm == 1:
                            batchnorm_name = 'bn%d_stage%d_L%d' % (
                                conv_counter, stage, level + 1)
                            n.tops[batchnorm_name] = _frozen_batchnorm(
                                n.tops[last_layer[level]])
                            last_layer[level] = batchnorm_name
                        ReLUname = 'relu%d_%d_CPM_L%d' % (
                            pool_counter, local_counter, level + 1)
                    else:
                        if batchnorm == 1:
                            batchnorm_name = 'Mbn%d_stage%d_L%d' % (
                                conv_counter, stage, level + 1)
                            n.tops[batchnorm_name] = _frozen_batchnorm(
                                n.tops[last_layer[level]])
                            last_layer[level] = batchnorm_name
                        ReLUname = 'Mrelu%d_stage%d_L%d' % (conv_counter, stage,
                                                            level + 1)
                    n.tops[ReLUname] = L.ReLU(n.tops[last_layer[level]],
                                              in_place=True)
                    print(ReLUname)
            conv_counter += 1
            local_counter += 1

        elif kind == 'P':  # Pooling
            pool_name = 'pool%d_stage%d' % (pool_counter, stage)
            n.tops[pool_name] = L.Pooling(
                n.tops[last_layer[0]],
                kernel_size=kernel[idx],
                stride=stride[idx],
                pool=P.Pooling.MAX)
            last_layer[0] = pool_name
            pool_counter += 1
            local_counter = 1
            conv_counter += 1
            print(last_layer[0])

        elif kind == 'L':
            # Loss: n.loss layer is only in training and testing nets, but
            # not in deploy net.
            if use_hdf5:
                n.tops['map_vec_stage%d' % stage] = L.Flatten(
                    n.tops[last_layer[0]])
                n.tops['loss_stage%d' % stage] = L.EuclideanLoss(
                    n.tops['map_vec_stage%d' % stage], n.tops[label_name[1]])
            elif training:
                level = 1
                name = 'weight_stage%d' % stage
                n.tops[name] = L.Eltwise(n.tops[last_layer[level]],
                                         n.tops[label_name[(level + 2)]],
                                         operation=P.Eltwise.PROD)
                n.tops['loss_stage%d' % stage] = L.EuclideanLoss(
                    n.tops[name], n.tops[label_name[level]])
            print('loss %d' % stage)
            stage += 1
            conv_counter = 1
            pool_counter = 1
            drop_counter = 1
            local_counter = 1
            state = 'image'

        elif kind == 'L2':
            # Loss: n.loss layer is only in training and testing nets, but
            # not in deploy net.
            weight = [lr_mult_distro[3], 1]
            for level in range(0, 2):
                if use_hdf5:
                    # The loss must read the per-branch Flatten top created
                    # on the line above; the un-suffixed
                    # 'map_vec_stage%d' key is never created on this path.
                    map_vec_name = 'map_vec_stage%d_L%d' % (stage, level + 1)
                    n.tops[map_vec_name] = L.Flatten(n.tops[last_layer[level]])
                    n.tops['loss_stage%d_L%d' % (stage, level + 1)] = L.EuclideanLoss(
                        n.tops[map_vec_name],
                        n.tops[label_name[level]],
                        loss_weight=weight[level])
                elif training:
                    name = 'weight_stage%d_L%d' % (stage, level + 1)
                    n.tops[name] = L.Eltwise(n.tops[last_layer[level]],
                                             n.tops[label_name[(level + 2)]],
                                             operation=P.Eltwise.PROD)
                    n.tops['loss_stage%d_L%d' % (stage, level + 1)] = L.EuclideanLoss(
                        n.tops[name],
                        n.tops[label_name[level]],
                        loss_weight=weight[level])
                print('loss %d level %d' % (stage, level + 1))
            stage += 1
            conv_counter = 1
            pool_counter = 1
            drop_counter = 1
            local_counter = 1
            state = 'image'

        elif kind == 'L3':
            # Loss: n.loss layer is only in training and testing nets, but
            # not in deploy net.
            weight = [lr_mult_distro[3], 1]
            if training:
                level = 0
                n.tops['loss_stage%d_L%d' % (stage, level + 1)] = L.Euclidean2Loss(
                    n.tops[last_layer[level]],
                    n.tops[label_name[level]],
                    n.tops[label_name[2]],
                    loss_weight=weight[level])
                print('loss %d level %d' % (stage, level + 1))
                level = 1
                n.tops['loss_stage%d_L%d' % (stage, level + 1)] = L.EuclideanLoss(
                    n.tops[last_layer[level]],
                    n.tops[label_name[level]],
                    loss_weight=weight[level])
                print('loss %d level %d' % (stage, level + 1))
            stage += 1
            conv_counter = 1
            pool_counter = 1
            drop_counter = 1
            local_counter = 1
            state = 'image'

        elif kind == 'D':
            if training:
                n.tops['drop%d_stage%d' % (drop_counter, stage)] = L.Dropout(
                    n.tops[last_layer[0]],
                    in_place=True,
                    dropout_param=dict(dropout_ratio=0.5))
                drop_counter += 1

        elif kind == '@':
            concat_name = 'concat_stage%d' % stage
            n.tops[concat_name] = L.Concat(
                n.tops[last_layer[0]],
                n.tops[last_layer[1]],
                n.tops[share_point],
                concat_param=dict(axis=1))
            local_counter = 1
            state = 'fuse'
            last_layer[0] = concat_name
            last_layer[1] = concat_name
            print(last_layer)

        elif kind == '$':
            share_point = last_layer[0]
            pool_counter += 1
            local_counter = 1
            print('share')

    if training:
        return str(n.to_proto())

    # Deploy net: generate the input information header string ...
    input_name = "data"
    dim1 = 1
    dim2 = 4
    dim3 = 368
    dim4 = 368
    deploy_str = 'input: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}\ninput_dim: {}'.format(
        '"' + input_name + '"', dim1, dim2, dim3, dim4)
    # ... and assemble it with the net layers string, dropping the first
    # (placeholder) layer from the serialization.
    return deploy_str + '\n' + 'layer {' + 'layer {'.join(
        str(n.to_proto()).split('layer {')[2:])
def mfh_baseline(mode, batchsize, T, question_vocab_size, folder):
    """Assemble the MFH baseline VQA network and return its proto.

    A Python data layer yields the question tokens, continuation flags,
    image feature and label. The question is embedded and passed through two
    stacked LSTMs; the last time slice of each is reshaped, dropped out and
    concatenated into ``q_feat``. Two cascaded fusion stages (o2, o3) combine
    ``q_feat`` with the image feature, and their normalized outputs are
    concatenated for the final prediction.

    Args:
        mode: 'val' selects the ``vqa_data_layer`` provider and
            SoftmaxWithLoss; any other value selects ``vqa_data_layer_kld``
            and SoftmaxKLDLoss.
        batchsize: forwarded to the data layer via ``param_str``.
        T: not used by this function.
        question_vocab_size: ``input_dim`` of the word embedding.
        folder: forwarded to the data layer via ``param_str``.

    Returns:
        The value of ``NetSpec.to_proto()`` for the assembled net.

    Note:
        Sizes and ratios are read from the module-level ``config``.
    """
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode': mode, 'batchsize': batchsize, 'folder': folder})
    provider_module = 'vqa_data_layer' if mode == 'val' else 'vqa_data_layer_kld'
    n.data, n.cont, n.img_feature, n.label = L.Python(
        module=provider_module,
        layer='VQADataProviderLayer',
        param_str=mode_str,
        ntop=4)

    n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300,
                      weight_filler=dict(type='xavier'))
    # The tanh top is registered, but the first LSTM below reads n.embed.
    n.embed_tanh = L.TanH(n.embed)

    def register_last_step(lstm_top, lstm_name, ordinal):
        """Slice an LSTM output over axis 0, silence all but the last slice,
        and register the reshaped + dropped-out last slice; returns that top."""
        steps = L.Slice(lstm_top, ntop=config.MAX_WORDS_IN_QUESTION,
                        slice_param={'axis': 0})
        for step in xrange(config.MAX_WORDS_IN_QUESTION - 1):
            setattr(n, 'slice_' + ordinal + str(step), steps[int(step)])
            setattr(n, 'silence_data_' + ordinal + str(step),
                    L.Silence(steps[int(step)], ntop=0))
        last_step = steps[config.MAX_WORDS_IN_QUESTION - 1]
        setattr(n, lstm_name + '_out', last_step)
        reshaped = L.Reshape(last_step,
                             reshape_param=dict(shape=dict(dim=[-1, 1024])))
        setattr(n, lstm_name + '_reshaped', reshaped)
        dropped = L.Dropout(
            reshaped,
            dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})
        setattr(n, lstm_name + '_reshaped_droped', dropped)
        return dropped

    # LSTM 1
    n.lstm1 = L.LSTM(
        n.embed, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    first_feat = register_last_step(n.lstm1, 'lstm1', 'first')
    n.lstm1_droped = L.Dropout(
        n.lstm1,
        dropout_param={'dropout_ratio': config.LSTM_DROPOUT_RATIO})

    # LSTM 2, stacked on the dropped-out sequence output of LSTM 1
    n.lstm2 = L.LSTM(
        n.lstm1_droped, n.cont,
        recurrent_param=dict(
            num_output=config.LSTM_UNIT_NUM,
            weight_filler=dict(type='xavier')))
    second_feat = register_last_step(n.lstm2, 'lstm2', 'second')

    n.q_feat = L.Concat(first_feat, second_feat)

    def register_fusion_stage(tag, carried_bottoms):
        """Register one fusion stage (projections, eltwise, dropout, sum
        pooling, signed sqrt, L2 norm); returns (dropout top, L2 top)."""
        q_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
                                weight_filler=dict(type='xavier'))
        setattr(n, 'mfb_q_%s_proj' % tag, q_proj)
        i_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
                                weight_filler=dict(type='xavier'))
        setattr(n, 'mfb_i_%s_proj' % tag, i_proj)
        # NOTE(review): operation=0 is presumably PROD in this Caffe build's
        # EltwiseParameter -- confirm against its caffe.proto.
        eltwise_bottoms = [q_proj, i_proj] + list(carried_bottoms)
        fused = L.Eltwise(*eltwise_bottoms, eltwise_param=dict(operation=0))
        setattr(n, 'mfb_iq_%s_eltwise' % tag, fused)
        dropped = L.Dropout(
            fused, dropout_param={'dropout_ratio': config.MFB_DROPOUT_RATIO})
        setattr(n, 'mfb_iq_%s_drop' % tag, dropped)
        factored = L.Reshape(
            dropped,
            reshape_param=dict(shape=dict(
                dim=[-1, 1, config.MFB_OUT_DIM, config.MFB_FACTOR_NUM])))
        setattr(n, 'mfb_iq_%s_resh' % tag, factored)
        pooled = L.Pooling(
            factored, pool=P.Pooling.SUM,
            pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
        setattr(n, 'mfb_iq_%s_sumpool' % tag, pooled)
        flat = L.Reshape(
            pooled,
            reshape_param=dict(shape=dict(dim=[-1, config.MFB_OUT_DIM])))
        setattr(n, 'mfb_%s_out' % tag, flat)
        rooted = L.SignedSqrt(flat)
        setattr(n, 'mfb_%s_sign_sqrt' % tag, rooted)
        normalized = L.L2Normalize(rooted)
        setattr(n, 'mfb_%s_l2' % tag, normalized)
        return dropped, normalized

    # Coarse Image-Question MFH fusion: the second stage additionally
    # multiplies in the dropped-out eltwise output of the first stage.
    o2_drop, o2_l2 = register_fusion_stage('o2', [])
    _, o3_l2 = register_fusion_stage('o3', [o2_drop])

    n.mfb_o23_l2 = L.Concat(o2_l2, o3_l2)

    n.prediction = L.InnerProduct(n.mfb_o23_l2,
                                  num_output=config.NUM_OUTPUT_UNITS,
                                  weight_filler=dict(type='xavier'))
    if mode == 'val':
        n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    else:
        n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
    return n.to_proto()