def build(self):
    '''Build the Mask R-CNN detection graph and return it as a Keras model.

    The graph is made of a backbone, a feature pyramid (FPN), a region
    proposal network (RPN) and the FPN classifier / bounding-box heads.
    ``self.mode`` selects which graph is built; the inputs and outputs of
    the model differ accordingly:

    * ``"training"``: inputs are the image, its meta data, the RPN targets
      and the ground-truth class IDs / boxes (plus ``input_roi`` when
      ``cfg.ARCHI.USE_RPN_ROIS`` is false); the outputs include the four
      loss tensors.
    * ``"inference"``: inputs are the image, its meta data and the
      anchors; the first output is the detections tensor.

    Returns:
        The Keras model, wrapped in ``ParallelModel`` when
        ``cfg.GPU_COUNT > 1``.

    Raises:
        ValueError: if ``self.mode`` is neither "training" nor "inference".
    '''
    # Fail early with a clear message: an unsupported mode would otherwise
    # surface much later as a NameError on `input_anchors`.
    if self.mode not in ('training', 'inference'):
        raise ValueError(
            "mode must be 'training' or 'inference', got {!r}".format(
                self.mode))

    # Inputs
    input_image = KL.Input(shape=[None, None, self.cfg.IMAGE.NB_CHANNELS],
                           name="input_image")
    input_image_meta = KL.Input(shape=[self.cfg.IMAGE_META_SIZE],
                                name="input_image_meta")
    if self.mode == 'training':
        # RPN GT
        input_rpn_match = KL.Input(shape=[None, 1],
                                   name="input_rpn_match",
                                   dtype=tf.int32)
        input_rpn_bbox = KL.Input(shape=[None, 4],
                                  name="input_rpn_bbox",
                                  dtype=tf.float32)

        # Detection GT (class IDs and bounding boxes)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = KL.Input(shape=[None],
                                      name="input_gt_class_ids",
                                      dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = KL.Input(shape=[None, 4],
                                  name="input_gt_boxes",
                                  dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = KL.Lambda(lambda x: gutils.norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
    elif self.mode == 'inference':
        # Anchors in normalized coordinates
        input_anchors = KL.Input(shape=[None, 4], name="input_anchors")

    # Build the shared convolutional layers.
    # Bottom-up Layers
    # The backbone returns the last layer of each of its 5 stages; stage 5
    # is requested (stage5=True) because C5 feeds the top of the pyramid.
    if callable(self.cfg.ARCHI.BACKBONE):
        _, C2, C3, C4, C5 = self.cfg.ARCHI.BACKBONE(
            input_image, stage5=True, train_bn=self.cfg.ARCHI.TRAIN_BN)
    else:
        _, C2, C3, C4, C5 = resnet.resnet_graph(
            input_image,
            self.cfg.ARCHI.BACKBONE,
            stage5=True,
            train_bn=self.cfg.ARCHI.TRAIN_BN)

    # Top-down Layers
    # TODO: add assert to verify feature map sizes match what's in config
    P5 = KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                   name='fpn_c5p5')(C5)
    P4 = KL.Add(name="fpn_p4add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
        KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c4p4')(C4)
    ])
    P3 = KL.Add(name="fpn_p3add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
        KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c3p3')(C3)
    ])
    P2 = KL.Add(name="fpn_p2add")([
        KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
        KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (1, 1),
                  name='fpn_c2p2')(C2)
    ])
    # Attach 3x3 conv to all P layers to get the final feature maps.
    P2 = KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                   padding="SAME", name="fpn_p2")(P2)
    P3 = KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                   padding="SAME", name="fpn_p3")(P3)
    P4 = KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                   padding="SAME", name="fpn_p4")(P4)
    P5 = KL.Conv2D(self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE, (3, 3),
                   padding="SAME", name="fpn_p5")(P5)
    # P6 is used for the 5th anchor scale in RPN. Generated by
    # subsampling from P5 with stride of 2.
    P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [P2, P3, P4, P5, P6]
    mrcnn_feature_maps = [P2, P3, P4, P5]

    # Anchors
    if self.mode == 'training':
        anchor_array = self.get_anchors(self.img_shape)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchor_array = np.broadcast_to(
            anchor_array, (self.cfg.BATCH_SIZE, ) + anchor_array.shape)
        # A hack to get around Keras's bad support for constants. The
        # array has its own name so the closure never sees the layer
        # output that `anchors` is bound to afterwards.
        anchors = KL.Lambda(lambda x: tf.Variable(anchor_array),
                            name="anchors")(input_image)
    else:
        anchors = input_anchors

    # RPN Model
    rpn = rpnlib.build_rpn_model(self.cfg.ARCHI.RPN_ANCHOR_STRIDE,
                                 len(self.cfg.ARCHI.RPN_ANCHOR_RATIOS),
                                 self.cfg.ARCHI.TOP_DOWN_PYRAMID_SIZE)
    # Loop through pyramid layers
    layer_outputs = [rpn([p]) for p in rpn_feature_maps]  # list of lists
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [
        KL.Concatenate(axis=1, name=n)(list(o))
        for o, n in zip(outputs, output_names)
    ]

    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    if self.mode == 'training':
        proposal_count = self.cfg.ARCHI.POST_NMS_ROIS_TRAINING
    else:
        proposal_count = self.cfg.ARCHI.POST_NMS_ROIS_INFERENCE
    rpn_rois = proposal.ProposalLayer(
        proposal_count=proposal_count,
        nms_threshold=self.cfg.ARCHI.RPN_NMS_THRESHOLD,
        name="ROI",
        config=self.cfg)([rpn_class, rpn_bbox, anchors])

    if self.mode == 'training':
        # Class ID mask to mark class IDs supported by the dataset the
        # image came from.
        active_class_ids = KL.Lambda(lambda x: meta.parse_image_meta_graph(
            x)["active_class_ids"])(input_image_meta)

        if not self.cfg.ARCHI.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = KL.Input(
                shape=[self.cfg.ARCHI.POST_NMS_ROIS_TRAINING, 4],
                name='input_roi',
                dtype=np.int32)
            # Normalize coordinates
            target_rois = KL.Lambda(lambda x: gutils.norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs and gt_boxes are zero padded.
        # Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox = detection_target.DetectionTargetLayer(
            self.cfg, name='proposal_targets')(
                [target_rois, input_gt_class_ids, gt_boxes])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = fpnlib.fpn_classifier_graph(
            rois,
            mrcnn_feature_maps,
            input_image_meta,
            self.cfg.ARCHI.POOL_SIZE,
            self.cfg.DATASET.NB_CLASSES,
            train_bn=self.cfg.ARCHI.TRAIN_BN,
            fc_layers_size=self.cfg.ARCHI.FPN_CLASSIF_FC_LAYERS_SIZE)

        # TODO: clean up (use tf.identity if necessary)
        output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        rpn_class_loss = KL.Lambda(lambda x: l.rpn_class_loss_graph(*x),
                                   name="rpn_class_loss")(
                                       [input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = KL.Lambda(
            lambda x: l.rpn_bbox_loss_graph(self.cfg, *x),
            name="rpn_bbox_loss")(
                [input_rpn_bbox, input_rpn_match, rpn_bbox])
        class_loss = KL.Lambda(lambda x: l.mrcnn_class_loss_graph(*x),
                               name="mrcnn_class_loss")([
                                   target_class_ids, mrcnn_class_logits,
                                   active_class_ids
                               ])
        bbox_loss = KL.Lambda(lambda x: l.mrcnn_bbox_loss_graph(*x),
                              name="mrcnn_bbox_loss")([
                                  target_bbox, target_class_ids, mrcnn_bbox
                              ])

        # Model
        inputs = [
            input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
            input_gt_class_ids, input_gt_boxes
        ]
        if not self.cfg.ARCHI.USE_RPN_ROIS:
            inputs.append(input_rois)
        outputs = [
            rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
            mrcnn_class, mrcnn_bbox, rpn_rois, output_rois, rpn_class_loss,
            rpn_bbox_loss, class_loss, bbox_loss
        ]
        model = KM.Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        # NOTE(review): the training branch reads the class count from
        # cfg.DATASET.NB_CLASSES while this branch reads
        # cfg.ARCHI.NB_CLASSES -- confirm both settings exist and agree.
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = fpnlib.fpn_classifier_graph(
            rpn_rois,
            mrcnn_feature_maps,
            input_image_meta,
            self.cfg.ARCHI.POOL_SIZE,
            self.cfg.ARCHI.NB_CLASSES,
            train_bn=self.cfg.ARCHI.TRAIN_BN,
            fc_layers_size=self.cfg.ARCHI.FPN_CLASSIF_FC_LAYERS_SIZE)

        # Detections
        # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)]
        # in normalized coordinates
        detections = detection.DetectionLayer(self.cfg,
                                              name="mrcnn_detection")([
                                                  rpn_rois, mrcnn_class,
                                                  mrcnn_bbox,
                                                  input_image_meta
                                              ])

        model = KM.Model([input_image, input_image_meta, input_anchors], [
            detections, mrcnn_class, mrcnn_bbox, rpn_rois, rpn_class,
            rpn_bbox
        ], name='mask_rcnn')

    # Add multi-GPU support.
    if self.cfg.GPU_COUNT > 1:
        from mrcnn.parallel_model import ParallelModel
        model = ParallelModel(model, self.cfg.GPU_COUNT)

    return model
def build_model(**params):
    """Build the image + context + word-sequence model predicting the next word.

    The model has four inputs (global image, local image, word indices and
    a 5-value context vector) and one softmax output over
    ``words.VOCABULARY_SIZE`` entries.

    Keyword Args:
        learnable_cnn_layers: number of trailing CNN layers left trainable;
            ``0`` freezes the whole CNN.
        wordvec_size: embedding size; also drives the size of the image
            feature projections.
        use_cgru: when true, add a residual ``SpatialCGRU`` on top of the
            CNN features.
        cgru_size: number of channels used by the CGRU stage.
        reduce_visual: when true, average the CNN feature map over its two
            spatial axes instead of applying extra convolutions.
        max_words: length of the word sequence.

    Returns:
        A Keras ``Model`` with inputs
        ``[input_img_global, input_img_local, input_words, input_ctx]``.
    """
    # TODO: get all these from **params
    CNN = 'resnet'
    INCLUDE_TOP = False
    LEARNABLE_CNN_LAYERS = params['learnable_cnn_layers']
    RNN_TYPE = 'LSTM'
    RNN_SIZE = 1024
    WORDVEC_SIZE = params['wordvec_size']
    ACTIVATION = 'relu'
    USE_CGRU = params['use_cgru']
    CGRU_SIZE = params['cgru_size']
    REDUCE_MEAN = params['reduce_visual']
    max_words = params['max_words']

    if CNN == 'vgg16':
        cnn = applications.vgg16.VGG16(include_top=INCLUDE_TOP)
    elif CNN == 'resnet':
        cnn = applications.resnet50.ResNet50(include_top=INCLUDE_TOP)
        # Pop the mean pooling layer
        cnn = models.Model(inputs=cnn.inputs, outputs=cnn.layers[-2].output)

    # `layers[:-0]` is an empty slice, so zero learnable layers must be
    # handled explicitly or nothing would be frozen at all.
    if LEARNABLE_CNN_LAYERS:
        frozen_layers = cnn.layers[:-LEARNABLE_CNN_LAYERS]
    else:
        frozen_layers = cnn.layers
    for layer in frozen_layers:
        layer.trainable = False

    # Context Vector input
    # normalized to [0,1] the values:
    # left, top, right, bottom, (box area / image area)
    input_ctx = layers.Input(shape=(5, ))
    ctx = layers.BatchNormalization()(input_ctx)
    repeat_ctx = layers.RepeatVector(max_words)(ctx)

    def build_image_branch():
        """Create one image input and its per-word repeated feature vector."""
        image_input = layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH,
                                          IMG_CHANNELS))
        features = cnn(image_input)

        # Add a residual CGRU layer
        if USE_CGRU:
            features = layers.Conv2D(CGRU_SIZE, (1, 1),
                                     padding='same',
                                     activation='relu')(features)
            res_cgru = SpatialCGRU(features, CGRU_SIZE)
            features = layers.add([features, res_cgru])

        if REDUCE_MEAN:
            # Average over the two spatial axes, one after the other.
            features = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(
                features)
            features = layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(
                features)
        else:
            # Layer sizes must be integers, hence the floor division.
            features = layers.Conv2D(WORDVEC_SIZE // 4, (3, 3),
                                     activation='relu')(features)
            features = layers.Conv2D(WORDVEC_SIZE // 2, (3, 3),
                                     activation='relu')(features)
            features = layers.Flatten()(features)

        features = layers.Concatenate()([features, ctx])
        features = layers.Dense(1024, activation='relu')(features)
        features = layers.BatchNormalization()(features)
        features = layers.Dense(WORDVEC_SIZE // 2,
                                activation=ACTIVATION)(features)
        features = layers.BatchNormalization()(features)
        features = layers.RepeatVector(max_words)(features)
        return image_input, features

    # Global image features (convnet output for the whole image)
    input_img_global, image_global = build_image_branch()
    # Local image features (convnet output for just the bounding box)
    input_img_local, image_local = build_image_branch()

    input_words = layers.Input(shape=(max_words, ), dtype='int32')
    language = layers.Embedding(words.VOCABULARY_SIZE,
                                WORDVEC_SIZE,
                                input_length=max_words)(input_words)

    x = layers.concatenate([image_global, image_local, repeat_ctx, language])
    if RNN_TYPE == 'LSTM':
        x = layers.LSTM(RNN_SIZE)(x)
    else:
        x = layers.GRU(RNN_SIZE)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(words.VOCABULARY_SIZE, activation='softmax')(x)

    return models.Model(
        inputs=[input_img_global, input_img_local, input_words, input_ctx],
        outputs=x)
def labels_to_image_model(im_shape,
                          n_channels,
                          crop_shape,
                          label_list,
                          n_neutral_labels,
                          vox2ras,
                          nonlin_shape_factor=0.0625,
                          crop_channel2=None,
                          output_div_by_n=None,
                          flipping=True):
    """Build a Keras model that spatially deforms and augments an image/label pair.

    The model applies the same affine + non-linear deformation to the image
    (linear interpolation) and to the label map (nearest neighbour), then
    optionally crops and flips both, and finally augments the intensities
    of every image channel independently.

    Args:
        im_shape: list with the spatial shape of the inputs (it is
            concatenated with ``[n_channels]`` to form the input shape).
        n_channels: number of image channels.
        crop_shape: requested crop shape, resolved through ``get_shapes``;
            no cropping is applied when that resolves to ``None``.
        label_list: label values; labels are remapped through a lookup
            table for the deformation and mapped back afterwards.
        n_neutral_labels: forwarded to ``label_map_random_flipping``.
        vox2ras: forwarded to the flipping helpers to find the axis to flip.
        nonlin_shape_factor: forwarded to ``utils.get_resample_shape`` to
            size the non-linear field input.
        crop_channel2: when not ``None``, the second channel (index 1) is
            passed through ``restrict_tensor`` with this value.
        output_div_by_n: forwarded to ``get_shapes``.
        flipping: whether to apply the random flipping step.

    Returns:
        ``(brain_model, output_shape)`` where ``brain_model`` maps
        ``[image_input, labels_input, aff_input, nonlin_input]`` to
        ``[image, labels]`` and ``output_shape`` is the image output shape
        without the batch axis.
    """
    # get shapes
    n_dims, _ = utils.get_dims(im_shape)
    crop_shape = get_shapes(crop_shape, im_shape, output_div_by_n)
    deformation_field_size = utils.get_resample_shape(
        im_shape, nonlin_shape_factor, len(im_shape))

    # create the LUT that makes sure that labels go from 0 to N-1
    _, lut = utils.rearrange_label_list(label_list)

    # define mandatory inputs
    image_input = KL.Input(shape=im_shape + [n_channels], name='image_input')
    labels_input = KL.Input(shape=im_shape + [1], name='labels_input')
    aff_in = KL.Input(shape=(n_dims + 1, n_dims + 1), name='aff_input')
    nonlin_field_in = KL.Input(shape=deformation_field_size,
                               name='nonlin_input')
    list_inputs = [image_input, labels_input, aff_in, nonlin_field_in]

    # convert labels to new_label_list
    labels = KL.Lambda(lambda x: tf.gather(
        tf.convert_to_tensor(lut, dtype='int32'), tf.cast(x, dtype='int32')))(
            labels_input)

    # deform labels
    image_input._keras_shape = tuple(image_input.get_shape().as_list())
    labels._keras_shape = tuple(labels.get_shape().as_list())
    labels = KL.Lambda(lambda x: tf.cast(x, dtype='float'))(labels)
    resize_shape = [
        max(int(full_size / 2), field_size)
        for full_size, field_size in zip(im_shape, deformation_field_size)
    ]
    nonlin_field = nrn_layers.Resize(size=resize_shape,
                                     interp_method='linear')(nonlin_field_in)
    nonlin_field = nrn_layers.VecInt()(nonlin_field)
    nonlin_field = nrn_layers.Resize(size=im_shape,
                                     interp_method='linear')(nonlin_field)
    image = nrn_layers.SpatialTransformer(interp_method='linear')(
        [image_input, aff_in, nonlin_field])
    labels = nrn_layers.SpatialTransformer(interp_method='nearest')(
        [labels, aff_in, nonlin_field])
    labels = KL.Lambda(lambda x: tf.cast(x, dtype='int32'))(labels)

    # cropping
    if crop_shape is not None:
        image, crop_idx = l2i_sa.random_cropping(image, crop_shape, n_dims)
        # crop the labels at the same location as the image
        labels = KL.Lambda(
            lambda x: tf.slice(x[0],
                               begin=tf.cast(x[1], dtype='int32'),
                               size=tf.convert_to_tensor(
                                   [-1] + crop_shape + [-1], dtype='int32')))(
                                       [labels, crop_idx])
    else:
        crop_shape = im_shape

    # flipping
    if flipping:
        labels, flip = l2i_sa.label_map_random_flipping(
            labels, label_list, n_neutral_labels, vox2ras, n_dims)
        ras_axes = edit_volumes.get_ras_axes(vox2ras, n_dims)
        flip_axis = [ras_axes[0] + 1]
        image = KL.Lambda(lambda y: K.switch(
            y[0],
            KL.Lambda(lambda x: K.reverse(x, axes=flip_axis))(y[1]), y[1]))(
                [flip, image])

    # convert labels back to original values
    labels = KL.Lambda(
        lambda x: tf.gather(tf.convert_to_tensor(label_list, dtype='int32'),
                            tf.cast(x, dtype='int32')),
        name='labels_out')(labels)

    # intensity augmentation
    image = KL.Lambda(lambda x: K.clip(x, 0, 300), name='clipping')(image)

    # loop over channels
    if n_channels > 1:
        split = KL.Lambda(lambda x: tf.split(x, [1] * n_channels, axis=-1))(
            image)
    else:
        split = [image]
    processed_channels = list()
    for i, channel in enumerate(split):

        # normalise and shift intensities (each channel on its own)
        channel = l2i_ia.min_max_normalisation(channel)
        channel = KL.Lambda(lambda x: K.random_uniform(
            (1, ), .85, 1.1) * x + K.random_uniform((1, ), -.3, .3))(channel)
        channel = KL.Lambda(lambda x: K.clip(x, 0, 1))(channel)
        channel = l2i_ia.gamma_augmentation(channel)

        # randomly crop sides of second channel
        if crop_channel2 is not None and i == 1:
            channel = l2i_sa.restrict_tensor(channel, crop_channel2, n_dims)

        # keep the result: the channels are put back together below
        processed_channels.append(channel)

    # concatenate all channels back, and clip output (include labels to keep
    # it when plugging to other models)
    if n_channels > 1:
        image = KL.concatenate(processed_channels)
    else:
        image = processed_channels[0]
    image = KL.Lambda(lambda x: K.clip(x[0], 0, 1),
                      name='image_out')([image, labels])

    # build model
    brain_model = Model(inputs=list_inputs, outputs=[image, labels])
    # shape of returned images
    output_shape = image.get_shape().as_list()[1:]

    return brain_model, output_shape
input3 = KL.Input((2, )) #对input1做操作得到temp1 temp1 = KL.BatchNormalization(axis=1)(input1) temp1 = KL.Conv2D(16, (3, 3), padding='same')(temp1) temp1 = KL.Activation('relu')(temp1) temp1 = KL.MaxPooling2D(2)(temp1) temp1 = KL.Flatten()(temp1) temp1 = KL.Dense(2)(temp1) #对input2做操作得到temp2 temp2 = KL.Dense(32)(input2) temp2 = KL.Dense(2)(temp2) #temp1,temp2计算得到loss1 ,通过Lambda自定义层 #对temp1,input3计算得到loss2 loss1 = KL.Lambda(lambda x: custom_loss1(*x), name='loss1')([temp1, temp2]) loss2 = KL.Lambda(lambda x: custom_loss2(*x), name='loss2')([temp1, input3]) #将输入输出放进model中,建立网络 model = Model([input1, input2, input3], [loss1, loss2]) plot_model(model, to_file='model.png', show_shapes=True) #查看model 网络结构 #将自定义的loss层的结果取出作为model的loss loss_layer1 = model.get_layer('loss1').output loss_layer2 = model.get_layer('loss2').output model.add_loss(loss_layer1) model.add_loss(loss_layer2) model.compile(optimizer='sgd', loss=[None, None]) #yield把函数变成一个生成器,逐块将数据载入,而不是一下子全部载入,减小显存占用 def data_gen(num):
def get_model(self,
              summary=True,
              num_capsule=32,
              len_ui=8,
              len_vj=16,
              routing=0,
              init_lr=0.001,
              l2_constant=0.0,
              dropout_ratio=0.1,
              num_classes=10):
    """Build and compile the image capsule network with its reconstruction head.

    Args:
        summary: print the Keras model summary when true.
        num_capsule: number of primary capsule types produced by the
            convolutional capsule layer.
        len_ui: length of each primary capsule vector.
        len_vj: length of each output (class) capsule vector.
        routing: truthy to enable routing in the ``Routing`` layer.
        init_lr: learning rate given to the Adam optimizer.
        l2_constant: L2 regularisation factor for the trainable layers.
        dropout_ratio: accepted for interface compatibility; not used by
            this method.
        num_classes: number of output capsules / classes.

    Returns:
        The compiled Keras model with inputs
        ``[input_img, input_mask, input_permutation]`` and outputs
        ``[pred_output, reconstruct]``.
    """
    use_routing = bool(routing)

    input_img = layers.Input((28, 28, 1))
    input_mask = layers.Input((num_classes, len_vj))
    # only use experiment reconstruction
    input_permutation = layers.Input((num_classes, len_vj))

    conv_layer = layers.Conv2D(256, (9, 9),
                               strides=(1, 1),
                               use_bias=True,
                               kernel_regularizer=l2(l2_constant),
                               activation=None)(input_img)
    conv_layer = layers.Activation('relu')(conv_layer)

    # convolutional capsule layer
    h_i = layers.Conv2D(num_capsule * len_ui,
                        kernel_size=(9, 9),
                        strides=(2, 2),
                        padding='valid',
                        use_bias=True,
                        kernel_regularizer=l2(l2_constant),
                        activation=None)(conv_layer)
    # one row of length len_ui per (spatial position, capsule type)
    h_i = layers.Reshape((K.int_shape(h_i)[1] * K.int_shape(h_i)[2] *
                          num_capsule, len_ui))(h_i)
    h_i = layers.Activation('relu')(h_i)

    # routing algorithm
    image_caps = Routing(num_capsule=num_classes,
                         l2_constant=l2_constant,
                         dim_capsule=len_vj,
                         routing=use_routing,
                         num_routing=3)(h_i)

    output = CapsuleNorm(name='pred_output')(image_caps)

    # reconstruction
    # mask_output : [B, Num Classes, len_vj]
    mask_output = layers.Multiply()([image_caps, input_mask])
    mask_output = layers.Add()([mask_output, input_permutation])
    # mask_output : [B, len_vj]
    # The Lambda must reduce its own input: closing over `mask_output`
    # would pick up the rebound name whenever the layer is called again.
    mask_output = layers.Lambda(lambda x: K.sum(x, axis=1))(mask_output)

    fc = layers.Dense(512,
                      activation='relu',
                      kernel_regularizer=l2(l2_constant))(mask_output)
    fc = layers.Dense(1024,
                      activation='relu',
                      kernel_regularizer=l2(l2_constant))(fc)
    fc = layers.Dense(784,
                      activation='sigmoid',
                      kernel_regularizer=l2(l2_constant),
                      name='reconstruct')(fc)

    model = Model([input_img, input_mask, input_permutation], [output, fc],
                  name='image-capsnet')

    if summary:
        model.summary()

    # compile model
    losses = {"pred_output": margin_loss, "reconstruct": reconstruct_loss}
    loss_weights = {"pred_output": 1.0, "reconstruct": 0.0005 * 784}
    metrics = {"pred_output": 'accuracy', "reconstruct": "mae"}

    model.compile(loss=losses,
                  loss_weights=loss_weights,
                  optimizer=Adam(init_lr,
                                 beta_1=0.9,
                                 beta_2=0.999,
                                 amsgrad=True),
                  metrics=metrics)
    return model
# initialize keys # keys = [tf.get_variable("Key_%d" % i, [EMBED_HIDDEN_SIZE], initializer=tf.random_normal_initializer(stddev=0.1)) # for i in range(NUM_BLOCKS)] def get_keys(x): keys = [key for key in range(vocab_size - NUM_BLOCKS, vocab_size)] return tf.squeeze(tf.reshape(keys, [1, -1])) def get_keys_shape(input_shape): return NUM_BLOCKS, # keys = get_keys(None) keys = layers.Lambda(get_keys, output_shape=get_keys_shape)(encoded_sentence) keys = embed_1(keys) print('embedded_keys', keys) keys = tf.split(keys, NUM_BLOCKS, axis=0) keys = [tf.squeeze(key, axis=0) for key in keys] # create the main Recurrent Entity Network cells last_state = RENLayer.REN(initial_batch_size=BATCH_SIZE, units=EMBED_HIDDEN_SIZE, num_blocks=NUM_BLOCKS, num_units_per_block=EMBED_HIDDEN_SIZE, vocab_size=vocab_size, keys=keys, activation=activation, initializer='normal')(encoded_sentence)
def build_rnn2(input, caption_gt, masks, config):
    """Build the attention-based LSTM caption decoder and its training loss.

    The image feature map is projected to 512 channels and reshaped to
    ``[config.BATCH_SIZE, 64, 512]`` (64 attention locations). The decoder
    is unrolled for 15 steps; at each step it attends over the locations,
    feeds the attended context and the previous ground-truth word to the
    LSTM, and decodes a word.

    Args:
        input: image feature tensor fed to the 3x3 projection convolution.
        caption_gt: ground-truth word indices, indexed as
            ``caption_gt[:, step]``.
        masks: per-step mask, indexed as ``masks[:, step]``.
        config: object providing ``BATCH_SIZE``.

    Returns:
        ``(outputs, predictions, total_loss)`` where ``outputs`` is
        ``[BATCH_SIZE, 15, 512]``, ``predictions`` is
        ``[BATCH_SIZE, 15, 1]`` (float) and ``total_loss`` is the masked
        cross entropy plus the attention regulariser.
    """
    down = KL.Conv2D(512, (3, 3),
                     padding="same",
                     activation="relu",
                     name='gcap_down_imagefeature')(input)
    conv_feats = KL.Lambda(
        lambda x: tf.reshape(x, [config.BATCH_SIZE, 64, 512]))(down)

    print("Building the RNN...")
    contexts = conv_feats
    reshaped_contexts = KL.Lambda(lambda x: tf.reshape(x, [-1, 512]))(contexts)
    # The context half of the attention score does not depend on the step.
    temp1 = attend_1(reshaped_contexts)
    w_embedding = KL.Embedding(input_dim=5000,
                               output_dim=512,
                               name='gcap_embedding')

    # Setup the LSTM
    # Initialize the LSTM using the mean context
    context_mean = KL.Lambda(lambda x: tf.reduce_mean(x, axis=1))(conv_feats)
    initial_memory, initial_output = initialize(context_mean)

    # Prepare to run
    predictions = []
    outputs = []
    num_steps = 15
    last_output = initial_output
    last_word = KL.Lambda(lambda x: K.zeros([config.BATCH_SIZE], 'int32'))(
        input)
    last_state = initial_output, initial_memory
    alphas = []
    cross_entropies = []

    lstm = KL.LSTM(512,
                   return_state=True,
                   recurrent_activation='hard_sigmoid',
                   name='gcap_lstm',
                   unit_forget_bias=False)

    # NOTE(review): several Lambda bodies below close over other tensors
    # (tiled_masks, masks, cross_entropy_loss) instead of receiving them as
    # inputs; kept as in the original graph.

    # Generate the words one by one
    for idx in range(num_steps):
        # Attention mechanism: use 2 fc layers to attend
        temp2 = attend_2(last_output)
        temp2 = KL.Lambda(lambda x: tf.reshape(
            tf.tile(tf.expand_dims(x, 1), [1, 64, 1]), [-1, 512]))(temp2)
        temp = KL.Add()([temp1, temp2])
        att_logits = attend_3(temp)
        att_logits = KL.Lambda(lambda x: tf.reshape(x, [-1, 64]))(att_logits)
        alpha = KL.Softmax()(att_logits)

        # Weight every location by its attention and sum over locations.
        alpha1 = KL.RepeatVector(512)(alpha)
        alpha1 = KL.Permute((2, 1))(alpha1)
        context = KL.Multiply()([contexts, alpha1])
        context = KL.Lambda(lambda x: tf.reduce_sum(x, axis=1))(context)

        # `idx` is bound as a default so each Lambda keeps its own step
        # index even if its function is called again after the loop.
        tiled_masks = KL.Lambda(
            lambda x, idx=idx: tf.tile(tf.expand_dims(x[:, idx], 1),
                                       [1, 64]))(masks)
        masked_alpha = KL.Lambda(lambda x: tf.reshape(x * tiled_masks, [-1]))(
            alpha)
        alphas.append(masked_alpha)

        word_embed = w_embedding(last_word)

        # Apply the LSTM
        current_input = KL.Concatenate(axis=-1)([context, word_embed])
        current_input = KL.Lambda(lambda x: tf.expand_dims(x, 1))(
            current_input)
        output, memory, cell_out = lstm(current_input,
                                        initial_state=list(last_state))
        outputs.append(output)

        # Decode the expanded output of LSTM into a word
        expanded_output = KL.Concatenate(axis=-1)(
            [output, context, word_embed])
        logits = decode(expanded_output)
        prediction = KL.Lambda(lambda x: tf.argmax(x, 1))(logits)
        predictions.append(prediction)

        # Compute the loss for this step
        masked_cross_entropy = KL.Lambda(lambda x: caption_loss(*x))(
            [caption_gt[:, idx], logits, masks[:, idx]])
        cross_entropies.append(masked_cross_entropy)

        last_output = output
        # Carry both states returned by the LSTM into the next step.
        last_state = memory, cell_out
        # Teacher forcing: the next input word is the ground-truth word.
        last_word = KL.Lambda(lambda x, idx=idx: tf.reshape(
            tf.cast(x[:, idx], tf.int32), [config.BATCH_SIZE]))(caption_gt)

    # Compute the final loss
    cross_entropies = KL.Lambda(lambda x: tf.stack(x, axis=1))(cross_entropies)
    cross_entropy_loss = KL.Lambda(
        lambda x: tf.reduce_sum(x) / tf.reduce_sum(masks))(cross_entropies)

    # NOTE(review): the [1, 64, -1] reshape only lines up with the
    # attention locations for a batch size of 1 -- confirm.
    alphas = KL.Lambda(lambda x: tf.reshape(tf.stack(x, axis=1), [1, 64, -1]))(
        alphas)
    attentions = KL.Lambda(lambda x: tf.reduce_sum(x, axis=2))(alphas)
    diffs = KL.Lambda(lambda x: tf.ones_like(x) - x)(attentions)
    attention_loss = KL.Lambda(lambda x: 0.01 * tf.nn.l2_loss(x) / (64))(diffs)

    total_loss = KL.Lambda(lambda x: cross_entropy_loss + x,
                           name="caption_loss")(attention_loss)

    # Stack along axis 1 so each sample keeps its own time series; a plain
    # reshape of the time-major list would mix samples when BATCH_SIZE > 1.
    outputs = KL.Lambda(lambda x: tf.reshape(
        tf.stack(x, axis=1), [config.BATCH_SIZE, num_steps, 512]))(outputs)
    predictions = KL.Lambda(lambda x: tf.reshape(
        tf.cast(tf.stack(x, axis=1), tf.float32),
        [config.BATCH_SIZE, num_steps, 1]))(predictions)
    print("RNN built.")
    return outputs, predictions, total_loss
def build_hani(**model_params):
    """
    Build and compile the siamese network described in the Hani et al. paper:
    --------------------------------------------------------
    Khalil-Hani, M., & Sung, L. S. (2014). A convolutional neural network approach for face verification.
    High Performance Computing & Simulation (HPCS), 2014 International Conference on, (3), 707–714.
    doi:10.1109/HPCSim.2014.6903759

    Recognised keyword arguments (all optional): ``act``, ``dropout``,
    ``batchnorm``, ``loss`` and ``learning_rate``.

    :return: the compiled two-input Keras model
    """

    def tanh_scaled(x):
        # scaled hyperbolic tangent: 1.7159 * tanh(2/3 * x)
        return 1.7159 * K.tanh((2 / 3) * x)

    act = model_params.get('act', tanh_scaled)
    dropout = model_params.get('dropout', 0)
    batchnorm = model_params.get('batchnorm', False)
    loss = model_params.get('loss', contrastive_loss)
    learning_rate = model_params.get('learning_rate', 1e-3)

    input_shape = (IMAGES_DIM, IMAGES_DIM, 1)
    first_input = KL.Input(input_shape)
    second_input = KL.Input(input_shape)

    # The same encoder instance processes both images (shared weights).
    model = keras.Sequential()

    # filters / dense / bias initializers
    initialize_weights_conv = keras.initializers.RandomNormal(mean=0.0,
                                                              stddev=0.01,
                                                              seed=84)
    initialize_weights_dense = keras.initializers.RandomNormal(mean=0.0,
                                                               stddev=0.2,
                                                               seed=84)
    initialize_bias = keras.initializers.RandomNormal(mean=0.5,
                                                      stddev=0.01,
                                                      seed=84)

    def add_conv_stage(conv_layer):
        # convolution, optional batch normalisation, then max pooling
        model.add(conv_layer)
        if batchnorm:
            model.add(KL.BatchNormalization())
        model.add(KL.MaxPool2D())

    add_conv_stage(
        KL.Conv2D(5, (6, 6),
                  strides=(2, 2),
                  activation=act,
                  input_shape=input_shape,
                  kernel_initializer=initialize_weights_conv,
                  kernel_regularizer=l2(1e-2)))

    add_conv_stage(
        KL.Conv2D(14, (6, 6),
                  strides=(2, 2),
                  activation=act,
                  kernel_initializer=initialize_weights_conv,
                  bias_initializer=initialize_bias,
                  kernel_regularizer=l2(1e-2)))
    model.add(KL.Dropout(dropout))

    add_conv_stage(
        KL.Conv2D(60, (6, 6),
                  activation=act,
                  kernel_initializer=initialize_weights_conv,
                  bias_initializer=initialize_bias,
                  kernel_regularizer=l2(1e-2)))
    model.add(KL.Flatten())

    # two 40-unit dense layers; only the first one is activated
    for dense_activation in (act, None):
        model.add(
            KL.Dense(40,
                     activation=dense_activation,
                     kernel_regularizer=l2(1e-4),
                     kernel_initializer=initialize_weights_dense,
                     bias_initializer=initialize_bias))

    # Generate the encodings (feature vectors) for the two images
    encoded_l = model(first_input)
    encoded_r = model(second_input)
    encodings = [encoded_l, encoded_r]

    # calculate similarity
    if loss != 'binary_crossentropy':
        similarity = KL.Lambda(euclidean_distance)(encodings)
    else:
        # element-wise L1 distance followed by a sigmoid unit
        L1_layer = KL.Lambda(lambda pair: K.abs(pair[0] - pair[1]))
        L1_distance = L1_layer(encodings)
        similarity = KL.Dense(1,
                              activation='sigmoid',
                              bias_initializer=initialize_bias)(L1_distance)

    # final network
    final_network = keras.Model(inputs=[first_input, second_input],
                                outputs=similarity)

    optimizer = keras.optimizers.Adam(lr=learning_rate)
    print(loss)
    final_network.compile(loss=loss,
                          optimizer=optimizer,
                          metrics=['accuracy'])
    return final_network
def define_vanilla_CNN_ResNet(
        input_shape=None,
        classes=10,
        block="basic",
        residual_unit="v2",
        repetitions=(2, 2, 2, 2),
        initial_filters=64,
        activation="softmax",
        include_top=True,
        input_tensor=None,
        dropout=None,
        transition_dilation_rate=(1, 1),
        initial_strides=(2, 2),
        initial_kernel_size=(7, 7),
        initial_pooling="max",
        final_pooling=None,
        top="classification",
        num_gpus=1,
):
    """Builds a custom ResNet architecture split over IoT, edge, fog and cloud nodes.

    With the default ``repetitions`` and the ``basic`` block this is a
    ResNet18 layout. The graph is assembled in four consecutive parts (IoT
    node, edge, fog node, cloud node); the inputs of the fog and cloud
    parts go through identity Lambda layers named ``node2_input`` and
    ``node1_input`` respectively.

    Args:
        input_shape: optional shape tuple, only to be specified
            if `include_top` is False (otherwise the input shape
            has to be `(224, 224, 3)` (with `channels_last` dim ordering)
            or `(3, 224, 224)` (with `channels_first` dim ordering).
            It should have exactly 3 dimensions,
            and width and height should be no smaller than 8.
            E.g. `(224, 224, 3)` would be one valid value.
        classes: The number of outputs at final softmax layer
        block: The block function to use. This is either `'basic'` or
            `'bottleneck'`. The original paper used `basic` for layers < 50.
        residual_unit: the basic residual unit, 'v1' for conv bn relu, 'v2'
            for bn relu conv. See [Identity Mappings in Deep Residual
            Networks](https://arxiv.org/abs/1603.05027) for details.
        repetitions: Number of repetitions of various block units; at least
            four entries are required. Entry 0 is used by the edge part,
            entry 1 by the fog part and entries 2 and 3 by the cloud part.
            At each block unit, the number of filters are doubled and the
            input size is halved.
        initial_filters: number of filters of the first convolution.
        activation: forwarded to `init_model` and to the cloud part.
        include_top: forwarded to `init_model` and to the cloud part.
        input_tensor: optional tensor to use as the input of the model.
        dropout: None for no dropout, otherwise rate of dropout from 0 to 1.
            Based on the [Wide Residual
            Networks](https://arxiv.org/pdf/1605.07146) paper.
        transition_dilation_rate: Dilation rate for transition layers. For
            semantic segmentation of images use a dilation rate of (2, 2).
        initial_strides: Stride of the very first residual unit and
            MaxPooling2D call, with default (2, 2), set to (1, 1) for small
            images like cifar.
        initial_kernel_size: kernel size of the very first convolution,
            (7, 7) for imagenet and (3, 3) for small image datasets like
            tiny imagenet and cifar. See
            [ResNeXt](https://arxiv.org/abs/1611.05431) paper for details.
        initial_pooling: Determine if there will be an initial pooling
            layer, 'max' for imagenet and None for small image datasets.
            See [ResNeXt](https://arxiv.org/abs/1611.05431) paper for
            details.
        final_pooling: Optional pooling mode for feature extraction at the
            final model layer when `include_top` is `False`.
            - `None` means that the output of the model will be the 4D
                tensor output of the last convolutional layer.
            - `avg` means that global average pooling will be applied to
                the output of the last convolutional layer, and thus the
                output of the model will be a 2D tensor.
            - `max` means that global max pooling will be applied.
        top: Defines final layers to evaluate based on a specific problem
            type. Options are 'classification' for ImageNet style problems,
            'segmentation' for problems like the Pascal VOC dataset, and
            None to exclude these layers entirely.
        num_gpus: forwarded to `compile_keras_parallel_model`.

    Returns:
        The `(model, parallel_model)` pair returned by
        `compile_keras_parallel_model`.

    Raises:
        ValueError: if `repetitions` has fewer than four entries.
    """
    # Entries 0..3 are indexed below; reject short inputs before any part
    # of the graph is created.
    if len(repetitions) < 4:
        raise ValueError(
            "repetitions must have at least 4 entries, got {}".format(
                len(repetitions)))

    input_shape, block_fn, residual_unit = init_model(input_shape, classes,
                                                      include_top, block,
                                                      residual_unit,
                                                      activation)
    img_input = layers.Input(shape=input_shape, tensor=input_tensor)

    # IoT Node
    iot = define_cnn_architecture_IoT(img_input, initial_filters,
                                      initial_kernel_size, initial_strides)
    # edge
    edge, filters = define_cnn_architecture_edge(
        iot,
        repetitions[0],
        transition_dilation_rate,
        block_fn,
        initial_filters,
        dropout,
        residual_unit,
        initial_pooling,
        initial_strides,
    )

    # fog node
    # identity layer that gives the fog part a named entry point
    fog = layers.Lambda(lambda x: x * 1, name="node2_input")(edge)
    fog, filters = define_cnn_architecture_fog(
        fog,
        repetitions[1],
        transition_dilation_rate,
        block_fn,
        filters,
        dropout,
        residual_unit,
    )

    # cloud node
    # identity layer that gives the cloud part a named entry point
    cloud = layers.Lambda(lambda x: x * 1, name="node1_input")(fog)
    cloud = define_cnn_architecture_cloud(
        cloud,
        repetitions[2],
        repetitions[3],
        transition_dilation_rate,
        block_fn,
        filters,
        dropout,
        residual_unit,
        input_shape,
        classes,
        activation,
        include_top,
        top,
        final_pooling,
    )

    model, parallel_model = compile_keras_parallel_model(
        img_input, cloud, num_gpus)
    return model, parallel_model
def build_paper_network(**model_params):
    """
    Build and compile the siamese network described in the original paper:
    --------------------------------------------------------
    Koch, Gregory, Richard Zemel, and Ruslan Salakhutdinov.
    "Siamese neural networks for one-shot image recognition."
    In ICML deep learning workshop, vol. 2. 2015.

    Every hyper-parameter can be overridden through a keyword argument; the
    recognised keys and their defaults are listed in `hyper` below.

    :return: the compiled two-input Keras model (SGD optimizer, accuracy metric)
    """
    # Defaults first, caller overrides on top.
    hyper = {
        'filter_size_conv1': 10,
        'filter_size_conv2': 7,
        'filter_size_conv3': 4,
        'filter_size_conv4': 4,
        'n_filters_conv1': 64,
        'n_filters_conv2': 128,
        'n_filters_conv3': 128,
        'n_filters_conv4': 256,
        'l2_conv1': 1e-2,
        'l2_conv2': 1e-2,
        'l2_conv3': 1e-2,
        'l2_conv4': 1e-2,
        'l2_dense': 1e-4,
        'learning_rate': 1e-3,
        'dense_size': 4096,
        'momentum': 0.5,
        'decay': 0.01,
        'loss': 'binary_crossentropy',
    }
    hyper.update(model_params)

    input_shape = (IMAGES_DIM, IMAGES_DIM, 1)
    first_input = KL.Input(input_shape)
    second_input = KL.Input(input_shape)

    # Shared seeded initializers for filters, dense weights and biases.
    conv_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=84)
    dense_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.2, seed=84)
    bias_init = keras.initializers.RandomNormal(mean=0.5, stddev=0.01, seed=84)

    # One shared encoder applied to both images.
    encoder = keras.Sequential()
    n_conv_layers = 4
    for stage in range(1, n_conv_layers + 1):
        kernel = hyper['filter_size_conv%d' % stage]
        # Only the first layer of the Sequential declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if stage == 1 else {}
        encoder.add(
            KL.Conv2D(hyper['n_filters_conv%d' % stage], (kernel, kernel),
                      activation='relu',
                      kernel_regularizer=l2(hyper['l2_conv%d' % stage]),
                      kernel_initializer=conv_init,
                      bias_initializer=bias_init,
                      **first_layer_kwargs))
        # Max-pooling follows every conv layer except the last one.
        if stage < n_conv_layers:
            encoder.add(KL.MaxPool2D())
    encoder.add(KL.Flatten())
    encoder.add(
        KL.Dense(hyper['dense_size'],
                 activation='sigmoid',
                 kernel_regularizer=l2(hyper['l2_dense']),
                 kernel_initializer=dense_init,
                 bias_initializer=bias_init))

    hidden_first = encoder(first_input)
    hidden_second = encoder(second_input)

    # Element-wise absolute difference of the two encodings, then a single
    # sigmoid unit producing the similarity score.
    abs_difference = KL.Lambda(
        lambda tensors: K.abs(tensors[0] - tensors[1]))
    distance = abs_difference([hidden_first, hidden_second])
    similarity = KL.Dense(1,
                          activation='sigmoid',
                          bias_initializer=bias_init)(distance)

    final_network = keras.Model(inputs=[first_input, second_input],
                                outputs=similarity)
    sgd = keras.optimizers.SGD(lr=hyper['learning_rate'],
                               momentum=hyper['momentum'],
                               decay=hyper['decay'])
    final_network.compile(loss=hyper['loss'],
                          optimizer=sgd,
                          metrics=['accuracy'])
    return final_network
def build_vggface(**model_params):
    """Build and compile a siamese network on top of a frozen VGGFace backbone.

    Recognised keyword arguments (with defaults):
        dense_size_1 (1024): units of the first dense layer.
        dense_size_2 (512): units of the optional second dense layer.
        learning_rate (1e-3): learning rate of the Adam optimizer.
        pre_trained_model ('vgg16'): one of 'vgg16', 'resnet50', 'senet50'.
        dropout_prob (0.2): dropout rate after each dense layer.
        use_second_dense_layer (False): add the second dense layer.
        loss ('binary_crossentropy'): loss passed to `compile`.

    `momentum` and `decay` may still be passed but are ignored: the optimizer
    is Adam, which is configured with the learning rate only.

    :return: the compiled two-input Keras model
    :raises ValueError: if `pre_trained_model` is not a supported backbone
    """
    from keras_vggface.vggface import VGG16, RESNET50, SENET50

    dense_layer_size_1 = model_params.get('dense_size_1', 1024)
    dense_layer_size_2 = model_params.get('dense_size_2', 512)
    learning_rate = model_params.get('learning_rate', 1e-3)
    pre_trained_model = model_params.get('pre_trained_model', 'vgg16')
    dropout_prob = model_params.get('dropout_prob', 0.2)
    use_second_dense_layer = model_params.get('use_second_dense_layer', False)
    loss = model_params.get('loss', 'binary_crossentropy')

    initialize_bias = keras.initializers.RandomNormal(
        mean=0.5, stddev=0.01, seed=84)  # bias initialize
    initialize_weights = keras.initializers.glorot_uniform(seed=84)

    input_shape = (224, 224, 3)
    first_input = KL.Input(input_shape)
    second_input = KL.Input(input_shape)

    # Backbone constructor and the number of trailing classifier layers to
    # pop from it.
    backbones = {
        'vgg16': (VGG16, 6),
        'resnet50': (RESNET50, 1),
        'senet50': (SENET50, 1),
    }
    if pre_trained_model not in backbones:
        raise ValueError(
            'Pretrained {} not familiar'.format(pre_trained_model))

    # remove the classifier layers and freeze the other layers
    # NOTE(review): in some Keras versions popping from `model.layers` does
    # not change the model's output tensor -- confirm that the classifier
    # head is really removed for the Keras version in use.
    build_backbone, n_classifier_layers = backbones[pre_trained_model]
    vggface = build_backbone()
    for _ in range(n_classifier_layers):
        vggface.layers.pop()

    for layer in vggface.layers:
        layer.trainable = False

    new_model = keras.Sequential()
    new_model.add(vggface)
    new_model.add(
        KL.Dense(dense_layer_size_1,
                 activation='relu',
                 kernel_initializer=initialize_weights,
                 bias_initializer=initialize_bias,
                 kernel_regularizer=l2(1e-2)))
    new_model.add(KL.BatchNormalization())
    new_model.add(KL.Dropout(dropout_prob))
    if use_second_dense_layer:
        new_model.add(
            KL.Dense(dense_layer_size_2,
                     activation='relu',
                     kernel_initializer=initialize_weights,
                     bias_initializer=initialize_bias,
                     kernel_regularizer=l2(1e-2)))
        new_model.add(KL.Dropout(dropout_prob))

    # The same encoder (shared weights) embeds both images.
    first_hidden = new_model(first_input)
    second_hidden = new_model(second_input)

    # Element-wise absolute difference followed by a sigmoid similarity unit.
    L1_layer = KL.Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([first_hidden, second_hidden])
    similarity = KL.Dense(1,
                          activation='sigmoid',
                          kernel_initializer=initialize_weights,
                          bias_initializer=initialize_bias)(L1_distance)

    final_network = keras.Model(inputs=[first_input, second_input],
                                outputs=similarity)
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    final_network.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return final_network
def build_hani_best_model(**model_params):
    """
    Build and compile the siamese network described in the Hani et al. paper:
    --------------------------------------------------------
    Khalil-Hani, M., & Sung, L. S. (2014). A convolutional neural network
    approach for face verification. High Performance Computing & Simulation
    (HPCS), 2014 International Conference on, (3), 707–714.
    doi:10.1109/HPCSim.2014.6903759

    but with optimized hyperparameters after the hyperas execution.

    Only the `act` keyword (activation, default 'relu') is honoured. All other
    hyper-parameters (regularization, dropout rates, optimizer settings, loss)
    are hard-coded below; keys such as `dropout`, `batchnorm`, `loss` and
    `learning_rate` are accepted but ignored.

    :return: the compiled two-input Keras model
    """
    act = model_params.get('act', 'relu')

    input_shape = (IMAGES_DIM, IMAGES_DIM, 1)
    first_input = KL.Input(input_shape)
    second_input = KL.Input(input_shape)

    model = keras.Sequential()
    initialize_weights_conv = keras.initializers.glorot_uniform(
        seed=84)  # filters initialize
    initialize_weights_dense = keras.initializers.glorot_uniform(
        seed=84)  # dense initialize
    initialize_bias = keras.initializers.RandomNormal(
        mean=0.5, stddev=0.01, seed=84)  # bias initialize

    # Conv block 1 (note: no custom bias initializer on this layer).
    model.add(
        KL.Conv2D(5, (6, 6),
                  strides=(2, 2),
                  activation=act,
                  input_shape=input_shape,
                  kernel_initializer=initialize_weights_conv,
                  kernel_regularizer=l2(0.03148394777069553)))
    model.add(KL.BatchNormalization())
    model.add(KL.Dropout(0.3065491917788273))
    model.add(KL.MaxPool2D())

    # Conv block 2 (batch normalization is disabled for this block).
    model.add(
        KL.Conv2D(14, (6, 6),
                  strides=(2, 2),
                  activation=act,
                  kernel_initializer=initialize_weights_conv,
                  bias_initializer=initialize_bias,
                  kernel_regularizer=l2(0.054048669207277224)))
    model.add(KL.Dropout(0.4797699256757003))
    model.add(KL.MaxPool2D())

    # Conv block 3.
    model.add(
        KL.Conv2D(60, (6, 6),
                  activation=act,
                  kernel_initializer=initialize_weights_conv,
                  bias_initializer=initialize_bias,
                  kernel_regularizer=l2(0.06189584230948173)))
    model.add(KL.BatchNormalization())
    model.add(KL.Dropout(0.020012398358003752))
    model.add(KL.MaxPool2D())

    model.add(KL.Flatten())

    # Two 40-unit dense layers; the second one has no activation.
    model.add(
        KL.Dense(40,
                 activation=act,
                 kernel_regularizer=l2(0.082430594544267),
                 kernel_initializer=initialize_weights_dense,
                 bias_initializer=initialize_bias))
    model.add(KL.Dropout(0.012533877486030926))
    model.add(
        KL.Dense(40,
                 activation=None,
                 kernel_regularizer=l2(0.046085917780636185),
                 kernel_initializer=initialize_weights_dense,
                 bias_initializer=initialize_bias))
    model.add(KL.Dropout(0.05086327591390307))

    # Generate the encodings (feature vectors) for the two images
    encoded_l = model(first_input)
    encoded_r = model(second_input)

    # calculate similarity
    L1_distance = KL.Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))(
        [encoded_l, encoded_r])
    similarity = KL.Dense(1,
                          activation='sigmoid',
                          kernel_initializer=initialize_weights_dense,
                          bias_initializer=initialize_bias)(L1_distance)

    final_network = keras.Model(inputs=[first_input, second_input],
                                outputs=similarity)
    optimizer = keras.optimizers.SGD(lr=0.03863427079945416,
                                     momentum=0.8962431889503087,
                                     decay=0.019965108317109886)
    final_network.compile(loss='binary_crossentropy',
                          optimizer=optimizer,
                          metrics=['accuracy'])
    return final_network
def make_parallel(self):
    """Build the multi-GPU wrapper graph around `self.inner_model`.

    Every input of the inner model is split into `self.gpu_count` slices, the
    inner model is called once per GPU on its slice, and the per-GPU outputs
    are merged on the CPU: averaged when the output has shape `()`,
    concatenated along axis 0 otherwise.

    Returns the list of merged output tensors, one per inner-model output.
    """
    # Slice inputs. Slice inputs on the CPU to avoid sending a copy
    # of the full inputs to all GPUs. Saves on bandwidth and memory.
    print('input_name:', self.inner_model.input_names)
    print('inputs:', self.inner_model.inputs)
    input_slices = {}
    for input_name, input_tensor in zip(self.inner_model.input_names,
                                        self.inner_model.inputs):
        input_slices[input_name] = tf.split(input_tensor, self.gpu_count)
    print('input_slices:', input_slices)

    output_names = self.inner_model.output_names
    # One bucket per model output; each collects that output from every GPU.
    outputs_all = [[] for _ in self.inner_model.outputs]
    print('outputs_all:', outputs_all)

    # Run the model call() on each GPU to place the ops there
    for i in range(self.gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i):
                # Run a slice of inputs through this replica
                zipped_inputs = zip(self.inner_model.input_names,
                                    self.inner_model.inputs)
                inputs = [
                    KL.Lambda(lambda s: input_slices[name][i],
                              output_shape=lambda s: (None, ) + s[1:])(tensor)
                    for name, tensor in zipped_inputs
                ]
                # Create the model replica and get the outputs
                outputs = self.inner_model(inputs)
                if not isinstance(outputs, list):
                    outputs = [outputs]
                # Save the outputs for merging back together later
                for position, replica_output in enumerate(outputs):
                    outputs_all[position].append(replica_output)

    # Merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = []
        for outputs, name in zip(outputs_all, output_names):
            # Outputs usually have a batch dimension and are concatenated
            # across it. A shape of () means the output is likely a loss or a
            # metric value, which gets averaged across the replicas instead
            # (Keras expects losses and metrics to be scalars).
            is_scalar = K.int_shape(outputs[0]) == ()
            if not is_scalar:
                # Concatenate
                m = KL.Concatenate(axis=0, name=name)(outputs)
            else:
                # Average
                m = KL.Lambda(lambda o: tf.add_n(o) / len(outputs),
                              name=name)(outputs)
            merged.append(m)
    return merged
def build_network(self):
    """Create and compile the policy network.

    Sets `self.policy` (maps the state input `s` to the policy output) and
    `self.model` (same outputs, with the extra input `G` that the custom loss
    multiplies into the loss), then compiles `self.model`.

    The policy output is a softmax `pi` for a discrete action space, or the
    pair `[mu, sigma]` otherwise.
    """
    he_normal = keras_init.he_normal
    glorot_normal = keras_init.glorot_normal

    s = keras_layers.Input(shape=self.nn.input_dims, dtype='float32',
                           name='s')
    G = keras_layers.Input(shape=(1, ), dtype='float32', name='G')

    if self.nn.input_type == INPUT_TYPE_OBSERVATION_VECTOR:
        x = keras_layers.Dense(self.nn.fc_layers_dims[0],
                               activation='relu',
                               kernel_initializer=he_normal())(s)
    else:  # self.input_type == INPUT_TYPE_STACKED_FRAMES
        # (layer name, filters, kernel size, stride) of each conv block.
        conv_blocks = (
            ('conv1', 32, (8, 8), 4),
            ('conv2', 64, (4, 4), 2),
            ('conv3', 128, (3, 3), 1),
        )
        x = s
        for block_name, n_filters, kernel, stride in conv_blocks:
            x = keras_layers.Conv2D(filters=n_filters,
                                    kernel_size=kernel,
                                    strides=stride,
                                    name=block_name,
                                    kernel_initializer=he_normal())(x)
            x = keras_layers.BatchNormalization(
                epsilon=1e-5, name=block_name + '_bn')(x)
            x = keras_layers.Activation(
                'relu', name=block_name + '_bn_ac')(x)
        x = keras_layers.Flatten()(x)

    x = keras_layers.Dense(self.nn.fc_layers_dims[-1],
                           activation='relu',
                           kernel_initializer=he_normal())(x)

    if self.nn.is_discrete_action_space:
        # a_probs = the stochastic policy (π)
        pi = keras_layers.Dense(self.nn.n_actions,
                                activation='softmax',
                                name='pi',
                                kernel_initializer=glorot_normal())(x)
        self.policy = keras_models.Model(inputs=s, outputs=pi)
        self.model = keras_models.Model(inputs=[s, G],
                                        outputs=pi)  # policy_model
    else:
        # Mean (μ)
        mu = keras_layers.Dense(self.nn.n_actions,
                                name='mu',
                                kernel_initializer=glorot_normal())(x)
        # unactivated STD (σ) - can be a negative number
        sigma_unactivated = keras_layers.Dense(
            self.nn.n_actions,
            name='sigma_unactivated',
            kernel_initializer=glorot_normal())(x)
        # Element-wise exponential e^(sigma_unactivated): sigma is activated
        # because STD (σ) must be strictly positive (non-zero - it's not a
        # Dirac delta function).
        sigma = keras_layers.Lambda(lambda sig: keras_backend.exp(sig),
                                    name='sigma')(sigma_unactivated)
        self.policy = keras_models.Model(inputs=s, outputs=[mu, sigma])
        self.model = keras_models.Model(inputs=[s, G],
                                        outputs=[mu, sigma])  # policy_model

    is_discrete_action_space = self.nn.is_discrete_action_space

    def custom_loss(y_true, y_pred):
        # (a_indices_one_hot, actor.output - pi \ [mu, sigma])
        if not is_discrete_action_space:
            # Mean (μ), STD (σ)
            mu_pred = y_pred[0]
            sigma_pred = y_pred[1]
            gaussian_dist = tfp.distributions.Normal(loc=mu_pred,
                                                     scale=sigma_pred)
            a_log_prob = gaussian_dist.log_prob(y_true[0])
            return -keras_backend.mean(a_log_prob) * G

        # prob of the chosen action
        prob_chosen_a = keras_backend.sum(y_pred * y_true)
        # boundaries to prevent from taking log of 0\1
        prob_chosen_a = keras_backend.clip(prob_chosen_a, 1e-8, 1 - 1e-8)
        # log_probability, negative value (since prob<1)
        log_prob_chosen_a = keras_backend.log(prob_chosen_a)
        return -log_prob_chosen_a * G

    optimizer = keras_get_optimizer(self.nn.optimizer_type, self.nn.ALPHA)
    self.model.compile(optimizer, loss=custom_loss)
def fpn_classifier_graph(rois,
                         feature_maps,
                         image_meta,
                         pool_size,
                         num_classes,
                         train_bn=True,
                         fc_layers_size=1024):
    """Builds the computation graph of the feature pyramid network classifier
    and regressor heads.

    rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized
          coordinates.
    feature_maps: List of feature maps from different layers of the pyramid,
                  [P2, P3, P4, P5]. Each has a different resolution.
    image_meta: [batch, (meta data)] Image details. See compose_image_meta()
    pool_size: The width of the square feature map generated from ROI Pooling.
    num_classes: number of classes, which determines the depth of the results
    train_bn: Boolean. Train or freeze Batch Norm layers
    fc_layers_size: Number of filters of the two conv layers standing in for
                    fully connected layers.

    Returns:
        logits: [N, NUM_CLASSES] classifier logits (before softmax)
        probs: [N, NUM_CLASSES] classifier probabilities
        bbox_deltas: [N, (dy, dx, log(dh), log(dw))] Deltas to apply to
                     proposal boxes
    """
    # ROI Pooling
    # Shape: [batch, num_boxes, pool_height, pool_width, channels]
    features = modellib.PyramidROIAlign(
        [pool_size, pool_size],
        name="roi_align_classifier")([rois, image_meta] + feature_maps)

    # Two FC layers (implemented with Conv2D for consistency). The first one
    # consumes the whole pooled window, the second one is a 1x1 convolution.
    fc_kernels = ((pool_size, pool_size), (1, 1))
    for stage, kernel in enumerate(fc_kernels, start=1):
        features = KL.TimeDistributed(
            KL.Conv2D(fc_layers_size, kernel, padding="valid"),
            name="mrcnn_class_conv%d" % stage)(features)
        features = KL.TimeDistributed(
            modellib.BatchNorm(),
            name='mrcnn_class_bn%d' % stage)(features, training=train_bn)
        features = KL.Activation('relu')(features)

    # Drop the two singleton spatial axes.
    shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
                       name="pool_squeeze")(features)

    # Classifier head
    mrcnn_class_logits = KL.TimeDistributed(
        KL.Dense(num_classes), name='mrcnn_class_logits')(shared)
    mrcnn_probs = KL.TimeDistributed(
        KL.Activation("softmax"), name="mrcnn_class")(mrcnn_class_logits)

    # BBox head: a single set of 4 deltas is predicted per box ...
    deltas = KL.TimeDistributed(KL.Dense(4, activation='linear'),
                                name='mrcnn_bbox_fc')(shared)
    # ... reshaped to [batch, boxes, 1, (dy, dx, log(dh), log(dw))] ...
    static_shape = K.int_shape(deltas)
    deltas = KL.Reshape((static_shape[1], 1, 4), name="mrcnn_bbox")(deltas)
    # ... and duplicated once per class (fg/bg detections).
    mrcnn_bbox = KL.Concatenate(axis=-2)([deltas] * num_classes)

    return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
epsilon = K.random_normal(shape=(batch, dim)) return z_mean + K.exp(0.5 * z_log_var) * epsilon # x = tf.placeholder(tf.float32, shape=[batch_size, 32, 32, 3]) # x = layers.Input(batch_shape=(batch_size, 32, 32, 3)) x = layers.Input(shape=(32, 32, 3)) encoded = encoder(x) mean = layers.Dense(1024, activation=tf.nn.softplus)(encoded) sigma = layers.Dense(1024, activation=tf.nn.relu)(encoded) # z = mean + tf.multiply(tf.sqrt(tf.exp(sigma)), # tf.random_normal(shape=(batch_size, 1024))) z = layers.Lambda(sampling)([mean, sigma]) my_encoder = keras.models.Model(x, [mean, sigma, z]) latent_inputs = layers.Input(shape=(1024, )) x_reco = decoder(latent_inputs) my_decoder = keras.models.Model(latent_inputs, x_reco) x_reco = my_decoder(my_encoder(x)[2]) my_vae = keras.models.Model(x, x_reco) reconstruction_term = -tf.reduce_sum( tfp.distributions.MultivariateNormalDiag( layers.Reshape( (3072, ))(x_reco), scale_identity_multiplier=0.05).log_prob( layers.Reshape((3072, ))(x)))
def build(self, mode, config):
    """Build the siamese Mask R-CNN architecture.

    The image and every target are passed through the same ResNet and FPN
    models (shared weights). The target pyramids are summed and divided by
    config.NUM_TARGETS, then combined with the image pyramid level by level
    through l1_distance_graph. RPN, proposal, detection-target and head
    layers are built on the combined feature maps, with the heads fixed to
    2 classes.

    mode: Either "training" or "inference". The inputs and outputs
        of the model differ accordingly.
    config: Configuration object. config.MODEL selects the outputs:
        'mrcnn' adds the mask head (and mask loss in training), 'frcnn'
        omits them.

    Side effect: config.NUM_TARGETS is set to 1 when it is falsy.

    Returns the Keras model, wrapped in ParallelModel when
    config.GPU_COUNT > 1.

    NOTE(review): when config.MODEL is neither 'mrcnn' nor 'frcnn',
    `outputs` still refers to the concatenated RPN outputs assigned earlier,
    so the model would be created with those outputs -- confirm whether this
    should raise instead.
    NOTE(review): both `config` and `self.config` are read below; presumably
    they refer to the same object -- verify against the caller.
    """
    assert mode in ['training', 'inference']

    # Image size must be dividable by 2 multiple times
    h, w = config.IMAGE_SHAPE[:2]
    if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6):
        raise Exception(
            "Image size must be dividable by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling."
            "For example, use 256, 320, 384, 448, 512, ... etc. ")

    # Inputs
    input_image = KL.Input(shape=config.IMAGE_SHAPE.tolist(),
                           name="input_image")
    # CHANGE: add target input
    # A falsy NUM_TARGETS is replaced by 1 (this mutates the config object).
    if not config.NUM_TARGETS:
        config.NUM_TARGETS = 1
    input_target = KL.Input(shape=[config.NUM_TARGETS] +
                            config.TARGET_SHAPE.tolist(),
                            name="input_target")
    input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE],
                                name="input_image_meta")
    if mode == "training":
        # RPN GT
        input_rpn_match = KL.Input(shape=[None, 1],
                                   name="input_rpn_match",
                                   dtype=tf.int32)
        input_rpn_bbox = KL.Input(shape=[None, 4],
                                  name="input_rpn_bbox",
                                  dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = KL.Input(shape=[None],
                                      name="input_gt_class_ids",
                                      dtype=tf.int32)
        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = KL.Input(shape=[None, 4],
                                  name="input_gt_boxes",
                                  dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = KL.Lambda(lambda x: modellib.norm_boxes_graph(
            x, K.shape(input_image)[1:3]))(input_gt_boxes)
        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        if config.USE_MINI_MASK:
            input_gt_masks = KL.Input(shape=[
                config.MINI_MASK_SHAPE[0], config.MINI_MASK_SHAPE[1], None
            ],
                                      name="input_gt_masks",
                                      dtype=bool)
        else:
            input_gt_masks = KL.Input(
                shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None],
                name="input_gt_masks",
                dtype=bool)
    elif mode == "inference":
        # Anchors in normalized coordinates
        input_anchors = KL.Input(shape=[None, 4], name="input_anchors")

    # Build the shared convolutional layers.
    # CHANGE: Use weightshared FPN model for image and target
    # Create FPN Model
    resnet = build_resnet_model(self.config)
    fpn = build_fpn_model(feature_maps=self.config.FPN_FEATUREMAPS)
    # Create Image FP
    _, IC2, IC3, IC4, IC5 = resnet(input_image)
    IP2, IP3, IP4, IP5, IP6 = fpn([IC2, IC3, IC4, IC5])
    # Create Target FR
    # Split the target input along axis 1 into one tensor per target.
    # Each Lambda is applied immediately, so `idx` has the intended value
    # when the slice is taken.
    input_targets = [
        KL.Lambda(lambda x: x[:, idx, ...])(input_target)
        for idx in range(input_target.shape[1])
    ]
    # Run every target through the shared ResNet + FPN and sum the resulting
    # pyramids level by level.
    for k, one_target in enumerate(input_targets):
        _, TC2, TC3, TC4, TC5 = resnet(one_target)
        out = fpn([TC2, TC3, TC4, TC5])
        if k == 0:
            target_pyramid = out
        else:
            target_pyramid = [
                KL.Add(name="target_adding_{}_{}".format(k, i))(
                    [target_pyramid[i], out[i]]) for i in range(len(out))
            ]

    # Divide the summed pyramid by NUM_TARGETS to obtain the mean pyramid.
    TP2, TP3, TP4, TP5, TP6 = [
        KL.Lambda(lambda x: x / config.NUM_TARGETS)(target_pyramid[i])
        for i in range(len(target_pyramid))
    ]

    # one_target = KL.Lambda(lambda x: x[:,0,...])(input_target)
    # one_target = input_target[:,0,...]
    # _, TC2, TC3, TC4, TC5 = resnet(one_target)
    # TP2, TP3, TP4, TP5, TP6 = fpn([TC2, TC3, TC4, TC5])

    # CHANGE: add siamese distance computation
    # Combine FPs using L1 distance
    P2 = l1_distance_graph(IP2,
                           TP2,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P2')
    P3 = l1_distance_graph(IP3,
                           TP3,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P3')
    P4 = l1_distance_graph(IP4,
                           TP4,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P4')
    P5 = l1_distance_graph(IP5,
                           TP5,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P5')
    P6 = l1_distance_graph(IP6,
                           TP6,
                           feature_maps=3 * self.config.FPN_FEATUREMAPS // 2,
                           name='P6')

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [P2, P3, P4, P5, P6]
    mrcnn_feature_maps = [P2, P3, P4, P5]

    # Anchors
    if mode == "training":
        anchors = self.get_anchors(config.IMAGE_SHAPE)
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchors = np.broadcast_to(anchors,
                                  (config.BATCH_SIZE, ) + anchors.shape)
        # A hack to get around Keras's bad support for constants
        anchors = KL.Lambda(lambda x: tf.Variable(anchors),
                            name="anchors")(input_image)
    else:
        anchors = input_anchors

    # RPN Model
    # CHANGE: Set number of filters to [3*self.config.FPN_FEATUREMAPS//2]
    rpn = modellib.build_rpn_model(config.RPN_ANCHOR_STRIDE,
                                   len(config.RPN_ANCHOR_RATIOS),
                                   3 * self.config.FPN_FEATUREMAPS // 2)
    # Loop through pyramid layers
    layer_outputs = []  # list of lists
    for p in rpn_feature_maps:
        layer_outputs.append(rpn([p]))
    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [
        KL.Concatenate(axis=1, name=n)(list(o))
        for o, n in zip(outputs, output_names)
    ]

    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"\
        else config.POST_NMS_ROIS_INFERENCE
    rpn_rois = modellib.ProposalLayer(
        proposal_count=proposal_count,
        nms_threshold=config.RPN_NMS_THRESHOLD,
        name="ROI",
        config=config)([rpn_class, rpn_bbox, anchors])

    if mode == "training":
        # Class ID mask to mark class IDs supported by the dataset the image
        # came from.
        active_class_ids = KL.Lambda(
            lambda x: modellib.parse_image_meta_graph(x)[
                "active_class_ids"])(input_image_meta)

        if not config.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],
                                  name="input_roi",
                                  dtype=np.int32)
            # Normalize coordinates
            target_rois = KL.Lambda(lambda x: modellib.norm_boxes_graph(
                x, K.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask =\
            modellib.DetectionTargetLayer(config, name="proposal_targets")([
                target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
            fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
                                 config.POOL_SIZE, num_classes=2,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        # The mask head is only built for the 'mrcnn' model variant.
        if config.MODEL == 'mrcnn':
            mrcnn_mask = fpn_mask_graph(rois,
                                        mrcnn_feature_maps,
                                        input_image_meta,
                                        config.MASK_POOL_SIZE,
                                        num_classes=2,
                                        train_bn=config.TRAIN_BN)

        # TODO: clean up (use tf.identify if necessary)
        output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        rpn_class_loss = KL.Lambda(
            lambda x: modellib.rpn_class_loss_graph(*x),
            name="rpn_class_loss")([input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = KL.Lambda(
            lambda x: modellib.rpn_bbox_loss_graph(config, *x),
            name="rpn_bbox_loss")(
                [input_rpn_bbox, input_rpn_match, rpn_bbox])
        # CHANGE: use custom class loss without using active_class_ids
        # (active_class_ids is still passed to the loss function as its
        # third argument)
        class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x),
                               name="mrcnn_class_loss")([
                                   target_class_ids, mrcnn_class_logits,
                                   active_class_ids
                               ])
        bbox_loss = KL.Lambda(lambda x: modellib.mrcnn_bbox_loss_graph(*x),
                              name="mrcnn_bbox_loss")([
                                  target_bbox, target_class_ids, mrcnn_bbox
                              ])
        if config.MODEL == 'mrcnn':
            mask_loss = KL.Lambda(
                lambda x: modellib.mrcnn_mask_loss_graph(*x),
                name="mrcnn_mask_loss")(
                    [target_mask, target_class_ids, mrcnn_mask])

        # Model
        # CHANGE: Added target to inputs
        inputs = [
            input_image, input_image_meta, input_target, input_rpn_match,
            input_rpn_bbox, input_gt_class_ids, input_gt_boxes,
            input_gt_masks
        ]
        if not config.USE_RPN_ROIS:
            inputs.append(input_rois)
        if config.MODEL == 'mrcnn':
            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss,
                mask_loss
            ]
        elif config.MODEL == 'frcnn':
            outputs = [
                rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
                mrcnn_class, mrcnn_bbox, rpn_rois, output_rois,
                rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss
            ]
        model = KM.Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox =\
            fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
                                 config.POOL_SIZE, num_classes=2,
                                 train_bn=config.TRAIN_BN,
                                 fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)

        # Detections
        # output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
        # normalized coordinates
        detections = modellib.DetectionLayer(config, name="mrcnn_detection")([
            rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta
        ])

        # Create masks for detections
        detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
        # CHANGE: reduce number of classes to 2
        # CHANGE: replaced with custom 2 class function
        if config.MODEL == 'mrcnn':
            mrcnn_mask = fpn_mask_graph(detection_boxes,
                                        mrcnn_feature_maps,
                                        input_image_meta,
                                        config.MASK_POOL_SIZE,
                                        num_classes=2,
                                        train_bn=config.TRAIN_BN)

        # CHANGE: Added target to the input
        inputs = [
            input_image, input_image_meta, input_target, input_anchors
        ]
        if config.MODEL == 'mrcnn':
            outputs = [
                detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
                rpn_class, rpn_bbox
            ]
        elif config.MODEL == 'frcnn':
            outputs = [
                detections, mrcnn_class, mrcnn_bbox, rpn_rois, rpn_class,
                rpn_bbox
            ]
        model = KM.Model(inputs, outputs, name='mask_rcnn')

    # Add multi-GPU support.
    if config.GPU_COUNT > 1:
        from mrcnn.parallel_model import ParallelModel
        model = ParallelModel(model, config.GPU_COUNT)

    return model
def multi_gpu_model(model, gpus=None):
    """Replicates a model on different GPUs.

    Specifically, this function implements single-machine
    multi-GPU data parallelism. It works in the following way:

    - Divide the model's input(s) into multiple sub-batches.
    - Apply a model copy on each sub-batch. Every model copy
        is executed on a dedicated GPU.
    - Concatenate the results (on CPU) into one big batch.

    E.g. if your `batch_size` is 64 and you use `gpus=2`,
    then we will divide the input into 2 sub-batches of 32 samples,
    process each sub-batch on one GPU, then return the full
    batch of 64 processed samples.

    This induces quasi-linear speedup on up to 8 GPUs.

    This function is only available with the TensorFlow backend
    for the time being.

    # Arguments
        model: A Keras model instance. To avoid OOM errors,
            this model could have been built on CPU, for instance
            (see usage example below).
        gpus: Integer >= 2 or list of integers, number of GPUs or
            list of GPU IDs on which to create model replicas.
            Must be given explicitly: automatic detection of the visible
            GPUs is disabled in this version.

    # Returns
        A Keras `Model` instance which can be used just like the initial
        `model` argument, but which distributes its workload
        on multiple GPUs.

    # Raises
        ValueError: if `gpus` is `None`, an integer <= 1, or a list/tuple
            with fewer than 2 entries.

    # Example

    ```python
        import tensorflow as tf
        from keras.applications import Xception
        from keras.utils import multi_gpu_model
        import numpy as np

        num_samples = 1000
        height = 224
        width = 224
        num_classes = 1000

        # Instantiate the base model (or "template" model).
        # We recommend doing this with under a CPU device scope,
        # so that the model's weights are hosted on CPU memory.
        # Otherwise they may end up hosted on a GPU, which would
        # complicate weight sharing.
        with tf.device('/cpu:0'):
            model = Xception(weights=None,
                             input_shape=(height, width, 3),
                             classes=num_classes)

        # Replicates the model on 8 GPUs.
        # This assumes that your machine has 8 available GPUs.
        parallel_model = multi_gpu_model(model, gpus=8)
        parallel_model.compile(loss='categorical_crossentropy',
                               optimizer='rmsprop')

        # Generate dummy data.
        x = np.random.random((num_samples, height, width, 3))
        y = np.random.random((num_samples, num_classes))

        # This `fit` call will be distributed on 8 GPUs.
        # Since the batch size is 256, each GPU will process 32 samples.
        parallel_model.fit(x, y, epochs=20, batch_size=256)

        # Save model via the template model (which shares the same weights):
        model.save('my_model.h5')
    ```

    # On model saving

    To save the multi-gpu model, use `.save(fname)` or `.save_weights(fname)`
    with the template model (the argument you passed to `multi_gpu_model`),
    rather than the model returned by `multi_gpu_model`.
    """
    # Backend check and GPU auto-detection are intentionally not performed
    # here, so `gpus` has to be provided. Reject `None` explicitly instead of
    # failing with an obscure TypeError on the comparison below.
    if gpus is None:
        raise ValueError('`multi_gpu_model` requires `gpus` to be an '
                         'integer >= 2 or a list of GPU IDs. '
                         'Received: `gpus=None`')
    if isinstance(gpus, (list, tuple)):
        if len(gpus) <= 1:
            # Wrap in a 1-tuple so that tuple inputs are formatted as a whole
            # instead of being unpacked by the % operator.
            raise ValueError('For multi-gpu usage to be effective, '
                             'call `multi_gpu_model` with `len(gpus) >= 2`. '
                             'Received: `gpus=%s`' % (gpus,))
        num_gpus = len(gpus)
        target_gpu_ids = gpus
    else:
        if gpus <= 1:
            raise ValueError('For multi-gpu usage to be effective, '
                             'call `multi_gpu_model` with `gpus >= 2`. '
                             'Received: `gpus=%d`' % gpus)
        num_gpus = gpus
        target_gpu_ids = range(num_gpus)

    import tensorflow as tf

    def get_slice(data, i, parts):
        """Return the i-th of `parts` slices of `data` along axis 0.

        The last slice also receives the remainder when the batch size is not
        divisible by `parts`.
        """
        shape = tf.shape(data)
        batch_size = shape[:1]
        input_shape = shape[1:]
        step = batch_size // parts
        if i == num_gpus - 1:
            size = batch_size - step * i
        else:
            size = step
        size = tf.concat([size, input_shape], axis=0)
        stride = tf.concat([step, input_shape * 0], axis=0)
        start = stride * i
        return tf.slice(data, start, size)

    def add_dim(tensor):
        """Add a dimension to tensors that don't have any."""
        if K.int_shape(tensor) == ():
            return KL.Lambda(lambda t: K.reshape(t, [1, 1]))(tensor)
        return tensor

    # One bucket per model output; each collects that output from every GPU.
    all_outputs = [[] for _ in model.outputs]

    # Place a copy of the model on each GPU,
    # each getting a slice of the inputs.
    for i, gpu_id in enumerate(target_gpu_ids):
        with tf.device('/gpu:%d' % gpu_id):
            with tf.name_scope('replica_%d' % gpu_id):
                inputs = []
                # Retrieve a slice of the input.
                for x in model.inputs:
                    input_shape = tuple(x.get_shape().as_list())[1:]
                    slice_i = KL.Lambda(get_slice,
                                        output_shape=input_shape,
                                        arguments={'i': i,
                                                   'parts': num_gpus})(x)
                    inputs.append(slice_i)

                # Apply model on slice
                # (creating a model replica on the target device).
                outputs = model(inputs)
                if not isinstance(outputs, list):
                    outputs = [outputs]

                # Save the outputs for merging back together later.
                for o, output in enumerate(outputs):
                    all_outputs[o].append(output)

    # Merge outputs on CPU.
    with tf.device('/cpu:0'):
        merged = []
        for name, outputs in zip(model.output_names, all_outputs):
            # If outputs are numbers without dimensions, add a batch dim.
            outputs = [add_dim(output) for output in outputs]
            merged.append(KL.concatenate(outputs, axis=0, name=name))
        return KM.Model(model.inputs, merged)
def build(self, mode, subnet, config):
    """Build the detection model for training or inference.

    Args:
        mode: Either "training" or "inference".
        subnet: In training mode selects which outputs the model exposes:
            "rpn" (RPN outputs, targets and RPN losses) or "all" (the
            former plus the classifier head outputs and its losses).
            Ignored in inference mode.
        config: Object read for ``image_size``, ``ratios``, ``scales``,
            ``rpn_stride`` and ``anchor_stride``; it is also forwarded to
            the proposal and detection-target layers.

    Returns:
        The assembled ``Model``.

    Raises:
        AssertionError: If ``mode`` is not "training" or "inference".
        ValueError: If ``mode`` is "training" and ``subnet`` is neither
            "rpn" nor "all".
    """
    assert mode in ["training", "inference"]
    # Fail early: previously an unknown subnet built the whole graph and
    # then crashed on the unbound ``model`` at the return statement.
    if mode == "training" and subnet not in ("rpn", "all"):
        raise ValueError(
            'subnet must be "rpn" or "all" in training mode, got %r'
            % (subnet,))

    input_image = KL.Input(shape=[64, 64, 3], dtype=tf.float32)
    input_bboxes = KL.Input(shape=[None, 4], dtype=tf.float32)
    input_class_ids = KL.Input(shape=[None], dtype=tf.int32)
    input_active_ids = KL.Input(shape=[4, ], dtype=tf.int32)
    input_rpn_match = KL.Input(shape=[None, 1], dtype=tf.int32)
    input_rpn_bbox = KL.Input(shape=[None, 4], dtype=tf.float32)

    # Scale ground-truth boxes by (h, w, h, w) of the configured image size.
    h, w = config.image_size[: 2]
    image_scale = K.cast(K.stack([h, w, h, w], axis=0), tf.float32)
    gt_bboxes = KL.Lambda(lambda x: x / image_scale)(input_bboxes)

    feature_map = resNet_featureExtractor(input_image)
    rpn_class, rpn_prob, rpn_bbox = rpn_net(feature_map, 9)
    anchors = utils.anchor_gen(featureMap_size=[8, 8],
                               ratios=config.ratios,
                               scales=config.scales,
                               rpn_stride=config.rpn_stride,
                               anchor_stride=config.anchor_stride)
    proposals = proposal_func.proposal(proposal_count=16,
                                       nms_thresh=0.7,
                                       anchors=anchors,
                                       batch_size=20,
                                       config=config)([rpn_prob, rpn_bbox])

    if mode == "training":
        target_rois, target_class_ids, target_delta, target_bboxes = \
            detection_target_fixed.DetectionTarget(
                config=config,
                name="proposal_target")(
                    [proposals, input_class_ids, gt_bboxes])
        denormalized_rois = KL.Lambda(
            lambda x: 8.0 * x, name="denormalized_rois")(target_rois)
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = fpn_classifiler(
            feature_map, denormalized_rois, 20, 21, 7, 4)

        loss_rpn_match = KL.Lambda(
            lambda x: rpn_class_loss(*x), name="loss_rpn_match")(
                [input_rpn_match, rpn_class])
        loss_rpn_bbox = KL.Lambda(
            lambda x: rpn_bbox_loss(*x), name="loss_rpn_bbox")(
                [input_rpn_bbox, input_rpn_match, rpn_bbox])
        bbox_loss = KL.Lambda(
            lambda x: mrcnn_bbox_loss_graph(*x), name="bbox_loss")(
                [target_delta, target_class_ids, mrcnn_bbox])
        class_loss = KL.Lambda(
            lambda x: mrcnn_class_loss_graphV2(*x),
            name="mrcnn_class_loss")(
                [target_class_ids, mrcnn_class_logits, input_active_ids])

        # Both training variants share the same inputs and leading outputs.
        train_inputs = [input_image, input_bboxes, input_class_ids,
                        input_active_ids, input_rpn_match, input_rpn_bbox]
        shared_outputs = [feature_map, rpn_class, rpn_prob, rpn_bbox,
                          proposals, target_rois, denormalized_rois,
                          target_class_ids, target_delta, target_bboxes]
        if subnet == "rpn":
            model = Model(
                train_inputs,
                shared_outputs + [loss_rpn_match, loss_rpn_bbox])
        else:  # subnet == "all" (validated above)
            model = Model(
                train_inputs,
                shared_outputs + [mrcnn_class_logits, mrcnn_class,
                                  mrcnn_bbox, loss_rpn_match, loss_rpn_bbox,
                                  bbox_loss, class_loss])

    if mode == "inference":
        denormalized_proposals = KL.Lambda(
            lambda x: 8.0 * x, name="denormalized_proposals")(proposals)
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = fpn_classifiler(
            feature_map, denormalized_proposals, 20, 16, 7, 4)
        detections = DetectionLayer()([proposals, mrcnn_class, mrcnn_bbox])
        model = Model([input_image], [detections])

    return model
def make_parallel(self):
    """Replicate the inner model across GPUs and merge the outputs.

    Every input of ``self.inner_model`` is split into ``self.gpu_count``
    parts with ``tf.split``; the model is called once per GPU on its part,
    and the per-GPU outputs are concatenated along axis 0 on the CPU.

    Returns:
        A list with one merged tensor per output of the inner model, each
        produced by a ``Concatenate`` layer named after that output.
    """
    def add_dim(tensor):
        """Add a dimension to tensors that don't have any."""
        if K.int_shape(tensor) == ():
            return KL.Lambda(lambda t: K.reshape(t, [1, 1]))(tensor)
        return tensor

    input_names = self.inner_model.input_names
    model_inputs = self.inner_model.inputs

    # Slice inputs. Slice inputs on the CPU to avoid sending a copy
    # of the full inputs to all GPUs. Saves on bandwidth and memory.
    if self.verbose:
        for each in zip(input_names, model_inputs):
            print('---> ', each)
    input_slices = {name: tf.split(x, self.gpu_count)
                    for name, x in zip(input_names, model_inputs)}

    output_names = self.inner_model.output_names
    outputs_all = [[] for _ in self.inner_model.outputs]

    # Run the model call() on each GPU to place the ops there
    for i in range(self.gpu_count):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('tower_%d' % i):
                # Run a slice of inputs through this replica. ``name`` and
                # ``i`` are bound as defaults so each Lambda keeps its own
                # slice even if its function is invoked again later.
                inputs = [
                    KL.Lambda(lambda s, name=name, i=i: input_slices[name][i],
                              output_shape=lambda s: (None,) + s[1:])(tensor)
                    for name, tensor in zip(input_names, model_inputs)]
                if self.verbose and i == 0:
                    print('\ntower_{0} - i/p '.format(i))
                    for each in inputs:
                        print('--->', each)

                # Create the model replica and get the outputs
                outputs = self.inner_model(inputs)
                # Normalise to a list before anything iterates over it;
                # a single-output model returns a bare tensor.
                if not isinstance(outputs, list):
                    outputs = [outputs]
                if self.verbose and i == 0:
                    print('\ntower_{0} - o/p '.format(i))
                    for each in outputs:
                        print('--->', each)

                # Save the outputs for merging back together later
                for idx, out in enumerate(outputs):
                    outputs_all[idx].append(out)

    # Merge outputs on CPU
    with tf.device('/cpu:0'):
        merged = []
        for outputs, name in zip(outputs_all, output_names):
            # If outputs are numbers without dimensions, add a batch dim.
            outputs = [add_dim(out) for out in outputs]
            merged.append(KL.Concatenate(axis=0, name=name)(outputs))
    return merged
def build_rnn(input, config):
    """Build an attention-based LSTM decoder unrolled for 15 steps.

    Args:
        input: Feature tensor fed to a 3x3 convolution with 512 filters;
            the result is reshaped to ``[config.BATCH_SIZE, 64, 512]``.
        config: Object read for ``BATCH_SIZE``.

    Returns:
        A tuple ``(outputs, predictions, alphas, att_masks)`` holding the
        per-step LSTM outputs, arg-max predictions, attention weights and
        attention masks, each passed through a reshaping ``Lambda``.

    Relies on the helpers ``attend_1``, ``attend_2``, ``attend_3``,
    ``initialize`` and ``decode`` being in scope.
    """
    ctx = 64
    num_steps = 15

    down = KL.Conv2D(512, (3, 3), padding="same", activation="relu",
                     name='gcap_down_imagefeature')(input)
    conv_feats = KL.Lambda(
        lambda x: tf.reshape(x, [config.BATCH_SIZE, ctx, 512]))(down)

    print("Building the RNN...")
    contexts = conv_feats
    reshaped_contexts = KL.Lambda(
        lambda x: tf.reshape(x, [-1, 512]))(contexts)
    temp1 = attend_1(reshaped_contexts)
    w_embedding = KL.Embedding(input_dim=5000, output_dim=512,
                               name='gcap_embedding')

    # Initialize the LSTM using the mean context.
    context_mean = KL.Lambda(lambda x: tf.reduce_mean(x, axis=1))(conv_feats)
    initial_memory, initial_output = initialize(context_mean)

    # Prepare to run
    predictions = []
    outputs = []
    alphas = []
    att_masks = []
    last_output = initial_output
    last_word = KL.Lambda(
        lambda x: K.zeros([config.BATCH_SIZE], 'int32'))(input)
    last_state = initial_output, initial_memory

    lstm = KL.LSTM(512, return_state=True,
                   recurrent_activation='hard_sigmoid',
                   name='gcap_lstm', unit_forget_bias=False)

    # Generate the words one by one
    for idx in range(num_steps):
        # Attention mechanism: score every context position against the
        # previous LSTM output.
        temp2 = attend_2(last_output)
        temp2 = KL.Lambda(lambda x: tf.reshape(
            tf.tile(tf.expand_dims(x, 1), [1, ctx, 1]), [-1, 512]))(temp2)
        temp = KL.Add()([temp1, temp2])
        att_logits = attend_3(temp)
        att_logits = KL.Lambda(
            lambda x: tf.reshape(x, [-1, ctx]))(att_logits)
        alpha = KL.Softmax()(att_logits)

        # Weight the contexts by alpha and sum over positions.
        alpha1 = KL.RepeatVector(512)(alpha)
        alpha1 = KL.Permute((2, 1))(alpha1)
        context = KL.Multiply()([contexts, alpha1])
        context = KL.Lambda(lambda x: tf.reduce_sum(x, axis=1))(context)
        alphas.append(alpha)

        word_embed = w_embedding(last_word)

        # Apply the LSTM
        current_input = KL.Concatenate(axis=-1)([context, word_embed])
        current_input = KL.Lambda(
            lambda x: tf.expand_dims(x, 1))(current_input)
        output, memory, cell_out = lstm(current_input,
                                        initial_state=list(last_state))
        outputs.append(output)

        # Decode the expanded output of LSTM into a word
        expanded_output = KL.Concatenate(axis=-1)(
            [output, context, word_embed])
        logits = decode(expanded_output)
        prediction = KL.Lambda(lambda x: tf.argmax(x, 1))(logits)
        predictions.append(prediction)

        last_output = output
        # Carry the two states returned by this step into the next one.
        # The original read an undefined ``state`` here (its assignment
        # appears to have been commented out).
        # NOTE(review): confirm (memory, cell_out) is the intended order.
        last_state = memory, cell_out

        # Mask is 0.0 when the first previous word equals the stop value
        # (0 on the first step, 2 afterwards), otherwise 1.0.
        stop_value = 0 if idx == 0 else 2
        att_mask = KL.Lambda(
            lambda x, stop_value=stop_value: K.switch(
                tf.equal(x[0], stop_value),
                tf.constant(0.0), tf.constant(1.0)))(last_word)
        att_masks.append(att_mask)

        last_word = KL.Lambda(lambda x: tf.cast(x, tf.int32))(prediction)

    # Pack the per-step lists into single tensors.
    outputs = KL.Lambda(
        lambda x: tf.reshape(x, [config.BATCH_SIZE, num_steps, 512]))(outputs)
    predictions = KL.Lambda(lambda x: tf.reshape(
        tf.cast(x, tf.float32),
        [config.BATCH_SIZE, num_steps, 1]))(predictions)
    att_masks = KL.Lambda(lambda x: tf.reshape(
        tf.cast(x, tf.float32), [num_steps, 1, 1, 1]))(att_masks)
    alphas = KL.Lambda(
        lambda x: tf.reshape(x, [config.BATCH_SIZE, num_steps, ctx]))(alphas)

    print("RNN built.")
    return outputs, predictions, alphas, att_masks
def make_model(self):
    """Build and compile the encoder, decoder and autoencoder models.

    The input of shape ``(timesteps, input_dim)`` is reshaped into
    ``partial_n`` chunks of ``partial_ts`` steps. Each chunk is encoded by
    a shared RNN, and the chunk codes are run through a second RNN. Each
    resulting code is decoded back into a sequence of frames.

    Sets ``self.encoder``, ``self.decoder`` and ``self.autoencoder``;
    compiles the autoencoder with Nadam and mean squared error, then
    prints the summaries of all three models.
    """
    inputs = K_layer.Input(shape=(self.timesteps, self.input_dim))
    reshaped = K_layer.Reshape(
        (self.partial_n, self.partial_ts, self.input_dim))(inputs)
    encode_reshape = K_layer.Reshape((self.partial_n, self.latent_dim))
    encode_1 = RNN_UNIT(self.latent_dim)
    encode_2 = RNN_UNIT(self.latent_dim, return_sequences=True)

    def encode_partials(seq):
        """Encode every chunk of ``seq`` with the shared ``encode_1``."""
        encoded = [None] * self.partial_n
        for i in range(self.partial_n):
            # ``i=i`` pins the index so each Lambda keeps its own chunk.
            rs = K_layer.Lambda(
                lambda x, i=i: x[:, i],
                output_shape=(self.partial_ts, self.input_dim))(seq)
            encoded[i] = encode_1(rs)
        return encode_reshape(K_layer.concatenate(encoded, axis=1))

    encoded = encode_partials(reshaped)
    # Formatted so the output is the same on Python 2 and Python 3.
    print('%s %s' % (K.int_shape(encoded), K.int_shape(reshaped)))
    encoded = encode_2(encoded)

    z = K_layer.Input(shape=(self.latent_dim, ))

    decoder_activation = 'tanh'
    # Floor division keeps the layer size an int on Python 3 as well.
    half_latent = self.latent_dim // 2
    decode_emb = K_layer.Dense(half_latent, activation=decoder_activation)
    decode_euler_2 = K_layer.Dense(self.output_dim,
                                   activation=decoder_activation)
    decode_repete = K_layer.RepeatVector(self.partial_n)
    decode_residual_emb = RNN_UNIT(half_latent, return_sequences=True,
                                   activation=decoder_activation)
    decode_residual_euler_2 = RNN_UNIT(self.output_dim,
                                       return_sequences=True,
                                       activation=decoder_activation)

    def decode_angle(e):
        """Decode one latent vector into ``timesteps`` output frames."""
        emb = decode_emb(e)
        emb_residual = decode_repete(e)
        emb_residual = decode_residual_emb(emb_residual)
        emb = K_layer.add([decode_repete(emb), emb_residual])

        # Each chunk's decoded frame is repeated for all of its timesteps.
        frames = [None] * self.timesteps
        for i in range(self.partial_n):
            e_ = K_layer.Lambda(lambda x, i=i: x[:, i],
                                output_shape=(half_latent, ))(emb)
            frame = decode_euler_2(e_)
            for j in range(i * self.partial_ts, (i + 1) * self.partial_ts):
                frames[j] = frame
        frames = K_layer.concatenate(frames, axis=1)
        frames = K_layer.Reshape((self.timesteps, self.output_dim))(frames)

        emb = K_layer.Lambda(
            lambda x: K.repeat_elements(x, self.partial_ts, axis=1),
            output_shape=(self.timesteps, half_latent))(emb)
        residual = decode_residual_euler_2(emb)
        frames = K_layer.Activation(decoder_activation)(
            K_layer.add([frames, residual]))
        return frames

    angles = [None] * self.partial_n
    for i in range(self.partial_n):
        e = K_layer.Lambda(lambda x, i=i: x[:, i],
                           output_shape=(self.latent_dim, ))(encoded)
        angles[i] = decode_angle(e)
    decoded = K_layer.concatenate(angles, axis=1)
    decoded_ = decode_angle(z)

    self.encoder = Model(inputs, encoded)
    self.decoder = Model(z, decoded_)
    self.autoencoder = Model(inputs, decoded)

    self.autoencoder.compile(optimizer='Nadam', loss='mean_squared_error')

    self.autoencoder.summary()
    self.encoder.summary()
    self.decoder.summary()
# The model is fully convolutional, but it is trained on inputs of this shape.
input_shape = x_train.shape[1:]
# Size of axis=1 for every layer, with the input itself first.
nfeats_all = [input_shape[0]] + nfeats

# Encoder: one conv-res block plus max-pooling per layer. The 'where' mask
# of every pooling step is collected so the decoder can reuse it.
img_input = Input(shape=input_shape)
wheres = []
y = img_input
for i in range(nlayers):
    y_prepool = convresblock(y, nfeats=nfeats_all[i + 1], ksize=ksize)
    pool = MaxPooling2D(pool_size=(pool_sizes[i], pool_sizes[i]))
    y = pool(y_prepool)
    wheres.append(
        layers.Lambda(getwhere, output_shape=lambda x: x[0])([y_prepool, y]))

# Decoder: walk the layers in reverse, upsample, and multiply by the stored
# 'where' mask to put the features back in place.
for ind in reversed(range(nlayers)):
    unpool = UpSampling2D(size=(pool_sizes[ind], pool_sizes[ind]))
    y = unpool(y)
    y = layers.multiply([y, wheres[ind]])
    y = convresblock(y, nfeats=nfeats_all[ind], ksize=ksize)

# Use hard_sigmoid to clip the range of the reconstruction.
y = Activation('hard_sigmoid')(y)

# Define the model with a mean squared error loss and compile it with Adam.
model = Model(img_input, y)
model.compile('adam', 'mse')
def model_ContextSum(p, embedding_matrix, max_sent_len, n_out):
    """Build the context-sum relation model and compile it.

    For every edge the sentence is encoded by (optionally bidirectional)
    LSTM layers; the edge's own sentence vector is concatenated with the
    sum-pooled vectors of all other edges and classified with a softmax.

    Args:
        p: Hyper-parameter dict read for 'dropout1', 'position_emb',
            'rnn1_layers', 'units1', 'bidirectional' and 'optimizer'.
        embedding_matrix: Pre-trained word embeddings; its two dimensions
            give the vocabulary size and the embedding size.
        max_sent_len: Length of the index-encoded input sentences.
        n_out: Number of output classes.

    Returns:
        The compiled model taking ``[sentence_input, entity_markers]``.
    """
    print("Parameters:", p)

    # Take sentence encoded as indices and convert it to embeddings
    sentence_input = layers.Input(shape=(max_sent_len, ), dtype='int32',
                                  name='sentence_input')
    # Repeat the input N times for each edge
    x = layers.RepeatVector(MAX_EDGES_PER_GRAPH)(sentence_input)
    word_embeddings = layers.wrappers.TimeDistributed(
        layers.Embedding(output_dim=embedding_matrix.shape[1],
                         input_dim=embedding_matrix.shape[0],
                         input_length=max_sent_len,
                         # Was ``embeddings``, a name this function never
                         # defines; the weights are the matrix passed in.
                         weights=[embedding_matrix],
                         mask_zero=True,
                         trainable=False))(x)
    word_embeddings = layers.Dropout(p['dropout1'])(word_embeddings)

    # Take token markers that identify entity positions, convert to
    # position embeddings
    entity_markers = layers.Input(shape=(MAX_EDGES_PER_GRAPH, max_sent_len, ),
                                  dtype='int8', name='entity_markers')
    pos_embeddings = layers.wrappers.TimeDistributed(
        layers.Embedding(output_dim=p['position_emb'],
                         input_dim=POSITION_VOCAB_SIZE,
                         input_length=max_sent_len,
                         mask_zero=True,
                         embeddings_regularizer=regularizers.l2(),
                         trainable=True))(entity_markers)

    # Merge word and position embeddings and apply the specified amount of
    # RNN layers. Without this merge the LSTMs would run on the raw
    # repeated indices and both embeddings would be unused.
    x = layers.concatenate([word_embeddings, pos_embeddings])
    for _ in range(p["rnn1_layers"] - 1):
        lstm_layer = layers.LSTM(p['units1'], return_sequences=True)
        if p['bidirectional']:
            lstm_layer = layers.Bidirectional(lstm_layer)
        x = layers.wrappers.TimeDistributed(lstm_layer)(x)
    lstm_layer = layers.LSTM(p['units1'], return_sequences=False)
    if p['bidirectional']:
        lstm_layer = layers.Bidirectional(lstm_layer)
    sentence_matrix = layers.wrappers.TimeDistributed(lstm_layer)(x)

    # Take the vector of the sentences with the target entity pair
    layers_to_concat = []
    num_units = p['units1'] * (2 if p['bidirectional'] else 1)
    context_shape = (MAX_EDGES_PER_GRAPH - 1, num_units)
    for i in range(MAX_EDGES_PER_GRAPH):
        # ``i=i`` pins the edge index inside every Lambda function.
        sentence_vector = layers.Lambda(
            lambda l, i=i: l[:, i],
            output_shape=(num_units, ))(sentence_matrix)
        # Context = every edge vector except the i-th one.
        if i == 0:
            context_vectors = layers.Lambda(
                lambda l, i=i: l[:, i + 1:],
                output_shape=context_shape)(sentence_matrix)
        elif i == MAX_EDGES_PER_GRAPH - 1:
            context_vectors = layers.Lambda(
                lambda l, i=i: l[:, :i],
                output_shape=context_shape)(sentence_matrix)
        else:
            context_vectors = layers.Lambda(
                lambda l, i=i: K.concatenate(
                    [l[:, :i], l[:, i + 1:]], axis=1),
                output_shape=context_shape)(sentence_matrix)
        context_vector = GlobalSumPooling1D()(context_vectors)
        edge_vector = layers.concatenate([sentence_vector, context_vector])
        edge_vector = layers.Reshape((1, num_units * 2))(edge_vector)
        layers_to_concat.append(edge_vector)
    edge_vectors = layers.Concatenate(1)(layers_to_concat)

    # Apply softmax
    edge_vectors = layers.Dropout(p['dropout1'])(edge_vectors)
    main_output = layers.wrappers.TimeDistributed(
        layers.Dense(n_out, activation="softmax",
                     name='main_output'))(edge_vectors)

    model = models.Model(inputs=[sentence_input, entity_markers],
                         outputs=[main_output])
    model.compile(optimizer=p['optimizer'],
                  loss=masked_categorical_crossentropy,
                  metrics=['accuracy'])
    return model
def build(self):
    """Build the Mask R-CNN architecture for training or inference.

    ``self.train_flag`` selects the variant. The training model takes the
    image, image meta and ground-truth inputs and outputs predictions plus
    the five loss tensors; the inference model takes the image, image meta
    and anchors and outputs detections, masks and RPN results.

    Returns:
        The Keras model, wrapped in ``ParallelModel`` when
        ``cfg.COMMON.GPU_COUNT`` is greater than 1.

    Raises:
        Exception: If the image height or width is not a multiple of 2**6.
    """
    # image shape
    h, w, c = self.image_shape[:]
    print("image_shape: {}".format(self.image_shape))

    # Modulo works for integer sizes on both Python 2 and 3; comparing
    # ``h / 2**6`` with its int() never fires under integer division.
    if h % 2**6 != 0 or w % 2**6 != 0:
        raise Exception(
            "Image size must be dividable by 2 at least 6 times "
            "to avoid fractions when downscaling and upscaling."
            "For example, use 256, 320, 384, 448, 512, ... etc. ")

    # Inputs
    input_image = kl.Input(shape=[None, None, c], name="input_image")
    input_image_meta = kl.Input(shape=[cfg.COMMON.IMAGE_META_SIZE],
                                name="input_image_meta")

    # Training
    if self.train_flag:
        # RPN GT
        input_rpn_match = kl.Input(shape=[None, 1], name="input_rpn_match",
                                   dtype=tf.int32)
        input_rpn_bbox = kl.Input(shape=[None, 4], name="input_rpn_bbox",
                                  dtype=tf.float32)

        # Detection GT (class IDs, bounding boxes, and masks)
        # 1. GT Class IDs (zero padded)
        input_gt_class_ids = kl.Input(shape=[None],
                                      name="input_gt_class_ids",
                                      dtype=tf.int32)

        # 2. GT Boxes in pixels (zero padded)
        # [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
        input_gt_boxes = kl.Input(shape=[None, 4], name="input_gt_boxes",
                                  dtype=tf.float32)
        # Normalize coordinates
        gt_boxes = kl.Lambda(lambda x: self.bbox_util.norm_boxes_graph(
            x, k.shape(input_image)[1:3]))(input_gt_boxes)

        # 3. GT Masks (zero padded)
        # [batch, height, width, MAX_GT_INSTANCES]
        if cfg.TRAIN.USE_MINI_MASK:
            min_h, min_w = cfg.TRAIN.MINI_MASK_SHAPE[:]
            input_gt_masks = kl.Input(shape=[min_h, min_w, None],
                                      name="input_gt_masks", dtype=bool)
        else:
            input_gt_masks = kl.Input(shape=[h, w, None],
                                      name="input_gt_masks", dtype=bool)

        # Anchors
        # Duplicate across the batch dimension because Keras requires it
        # TODO: can this be optimized to avoid duplicating the anchors?
        anchor_values = self.anchor_utils.get_anchors(self.image_shape)
        anchor_values = np.broadcast_to(
            anchor_values, (self.batch_size, ) + anchor_values.shape)
        # A hack to get around Keras's bad support for constants. The layer
        # is created once (it used to be created twice under the same
        # name), and the closure captures a dedicated name so it always
        # sees the numpy array rather than the rebound ``anchors`` tensor.
        anchors = kl.Lambda(lambda x: tf.Variable(anchor_values),
                            name="anchors")(input_image)
    else:
        # Anchors in normalized coordinates
        anchors = kl.Input(shape=[None, 4], name="input_anchors")

        # The training-only inputs are not needed for inference; define
        # them here as None so the names always exist.
        input_rpn_match = None
        input_rpn_bbox = None
        input_gt_class_ids = None
        gt_boxes = None
        input_gt_boxes = None
        input_gt_masks = None

    # Build the shared convolutional layers.
    # Bottom-up Layers
    # Returns the last layers of each stage; the first one is unused here.
    _, c2, c3, c4, c5 = backbone.resnet_graph(input_image, self.backbone,
                                              stage5=True)

    # Top-down Layers
    # TODO: add assert to verify feature map sizes match what's in config
    p5 = kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c5p5')(c5)
    p4 = kl.Add(name="fpn_p4add")([
        kl.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(p5),
        kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c4p4')(c4)
    ])
    p3 = kl.Add(name="fpn_p3add")([
        kl.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(p4),
        kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c3p3')(c3)
    ])
    p2 = kl.Add(name="fpn_p2add")([
        kl.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(p3),
        kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c2p2')(c2)
    ])

    # Attach 3x3 conv to all P layers to get the final feature maps.
    p2 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME",
                   name="fpn_p2")(p2)
    p3 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME",
                   name="fpn_p3")(p3)
    p4 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME",
                   name="fpn_p4")(p4)
    p5 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME",
                   name="fpn_p5")(p5)
    # P6 is used for the 5th anchor scale in RPN. Generated by
    # subsampling from P5 with stride of 2.
    p6 = kl.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(p5)

    # Note that P6 is used in RPN, but not in the classifier heads.
    rpn_feature_maps = [p2, p3, p4, p5, p6]
    mrcnn_feature_maps = [p2, p3, p4, p5]

    # RPN Model
    rpn = common.build_rpn_model(self.rpn_anchor_stride,
                                 len(self.rpn_anchor_ratios),
                                 self.top_down_pyramid_size)

    # Loop through pyramid layers
    layer_outputs = [rpn([p]) for p in rpn_feature_maps]  # list of lists

    # Concatenate layer outputs
    # Convert from list of lists of level outputs to list of lists
    # of outputs across levels.
    # e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
    output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
    outputs = list(zip(*layer_outputs))
    outputs = [
        kl.Concatenate(axis=1, name=n)(list(o))
        for o, n in zip(outputs, output_names)
    ]
    rpn_class_logits, rpn_class, rpn_bbox = outputs

    # Generate proposals
    # Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
    # and zero padded.
    proposal_count = (cfg.TRAIN.POST_NMS_ROIS if self.train_flag
                      else cfg.TEST.POST_NMS_ROIS)
    rpn_rois = common.ProposalLayer(
        proposal_count=proposal_count,
        nms_threshold=self.rpn_nms_threshold,
        batch_size=self.batch_size,
        name="ROI")([rpn_class, rpn_bbox, anchors])

    fc_layer_size = cfg.COMMON.FPN_CLASS_FC_LAYERS_SIZE
    pool_size = cfg.COMMON.POOL_SIZE
    mask_pool_size = cfg.COMMON.MASK_POOL_SIZE
    train_or_freeze = cfg.COMMON.TRAIN_FLAG

    if self.train_flag:
        # Class ID mask to mark class IDs supported by the dataset the
        # image came from.
        active_class_ids = kl.Lambda(
            lambda x: self.image_utils.parse_image_meta_graph(x)[
                "active_class_ids"])(input_image_meta)

        if not cfg.TRAIN.USE_RPN_ROIS:
            # Ignore predicted ROIs and use ROIs provided as an input.
            input_rois = kl.Input(shape=[proposal_count, 4],
                                  name="input_roi", dtype=np.int32)
            # Normalize coordinates
            target_rois = kl.Lambda(
                lambda x: self.bbox_util.norm_boxes_graph(
                    x, k.shape(input_image)[1:3]))(input_rois)
        else:
            target_rois = rpn_rois
            input_rois = None

        # Generate detection targets
        # Subsamples proposals and generates target outputs for training
        # Note that proposal class IDs, gt_boxes, and gt_masks are zero
        # padded. Equally, returned rois and targets are zero padded.
        rois, target_class_ids, target_bbox, target_mask = \
            common.DetectionTargetLayer(
                self.batch_size, name="proposal_targets")([
                    target_rois, input_gt_class_ids, gt_boxes,
                    input_gt_masks])

        # Network Heads
        # TODO: verify that this handles zero padded ROIs
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            common.fpn_classifier_graph(
                rois, mrcnn_feature_maps, input_image_meta, pool_size,
                self.class_num, train_flag=train_or_freeze,
                fc_layers_size=fc_layer_size)

        mrcnn_mask = common.build_fpn_mask_graph(
            rois, mrcnn_feature_maps, input_image_meta, mask_pool_size,
            self.class_num, train_flag=train_or_freeze)

        # TODO: clean up (use tf.identify if necessary)
        output_rois = kl.Lambda(lambda x: x * 1, name="output_rois")(rois)

        # Losses
        rpn_class_loss = kl.Lambda(
            lambda x: common.rpn_class_loss_graph(*x),
            name="rpn_class_loss")([input_rpn_match, rpn_class_logits])
        rpn_bbox_loss = kl.Lambda(
            lambda x: common.rpn_bbox_loss_graph(self.batch_size, *x),
            name="rpn_bbox_loss")(
                [input_rpn_bbox, input_rpn_match, rpn_bbox])
        class_loss = kl.Lambda(
            lambda x: common.mrcnn_class_loss_graph(*x),
            name="mrcnn_class_loss")([
                target_class_ids, mrcnn_class_logits, active_class_ids
            ])
        bbox_loss = kl.Lambda(
            lambda x: common.mrcnn_bbox_loss_graph(*x),
            name="mrcnn_bbox_loss")([
                target_bbox, target_class_ids, mrcnn_bbox
            ])
        mask_loss = kl.Lambda(
            lambda x: common.mrcnn_mask_loss_graph(*x),
            name="mrcnn_mask_loss")([
                target_mask, target_class_ids, mrcnn_mask
            ])

        # Model
        inputs = [
            input_image, input_image_meta, input_rpn_match, input_rpn_bbox,
            input_gt_class_ids, input_gt_boxes, input_gt_masks
        ]
        if not cfg.TRAIN.USE_RPN_ROIS:
            inputs.append(input_rois)

        outputs = [
            rpn_class_logits, rpn_class, rpn_bbox, mrcnn_class_logits,
            mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, output_rois,
            rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss
        ]
        model = km.Model(inputs, outputs, name='mask_rcnn')
    else:
        # Network Heads
        # Proposal classifier and BBox regressor heads
        mrcnn_class_logits, mrcnn_class, mrcnn_bbox = \
            common.fpn_classifier_graph(
                rpn_rois, mrcnn_feature_maps, input_image_meta, pool_size,
                self.class_num, train_flag=train_or_freeze,
                fc_layers_size=fc_layer_size)

        # Detections
        # output is [batch, num_detections,
        # (y1, x1, y2, x2, class_id, score)] in normalized coordinates
        detections = common.DetectionLayer(
            self.batch_size, name="mrcnn_detection")([
                rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta
            ])

        # Create masks for detections
        detection_boxes = kl.Lambda(lambda x: x[..., :4])(detections)
        mrcnn_mask = common.build_fpn_mask_graph(
            detection_boxes, mrcnn_feature_maps, input_image_meta,
            mask_pool_size, self.class_num, train_flag=train_or_freeze)

        model = km.Model([input_image, input_image_meta, anchors], [
            detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois,
            rpn_class, rpn_bbox
        ], name='mask_rcnn')

    # Add multi-GPU support.
    gpu_count = cfg.COMMON.GPU_COUNT
    if gpu_count > 1:
        from m_rcnn.parallel_model import ParallelModel
        model = ParallelModel(model, gpu_count)

    return model
def model_ContextWeighted(p, embedding_matrix, max_sent_len, n_out):
    """Build the context-weighted (attention) relation model and compile it.

    For every edge the sentence is encoded by (optionally bidirectional)
    LSTM layers. The vectors of the other edges are scored against a linear
    projection of the edge's own vector, and their softmax-weighted sum is
    concatenated with that vector and classified with a softmax.

    Args:
        p: Hyper-parameter dict read for 'dropout1', 'position_emb',
            'rnn1_layers', 'units1' and 'bidirectional'.
        embedding_matrix: Pre-trained word embeddings; its two dimensions
            give the vocabulary size and the embedding size.
        max_sent_len: Length of the index-encoded input sentences.
        n_out: Number of output classes.

    Returns:
        The model taking ``[sentence_input, entity_markers]``, compiled
        with Adam (lr=0.001).
    """
    print("Parameters:", p)

    # Take sentence encoded as indices and convert it to embeddings
    sentence_input = layers.Input(shape=(max_sent_len, ), dtype='int32',
                                  name='sentence_input')
    # Repeat the input N times for each edge
    x = layers.RepeatVector(MAX_EDGES_PER_GRAPH)(sentence_input)
    word_embeddings = layers.wrappers.TimeDistributed(
        layers.Embedding(output_dim=embedding_matrix.shape[1],
                         input_dim=embedding_matrix.shape[0],
                         input_length=max_sent_len,
                         weights=[embedding_matrix],
                         mask_zero=True,
                         trainable=False))(x)
    word_embeddings = layers.Dropout(p['dropout1'])(word_embeddings)

    # Take token markers that identify entity positions, convert to
    # position embeddings
    entity_markers = layers.Input(shape=(MAX_EDGES_PER_GRAPH, max_sent_len, ),
                                  dtype='int8', name='entity_markers')
    pos_embeddings = layers.wrappers.TimeDistributed(
        layers.Embedding(output_dim=p['position_emb'],
                         input_dim=POSITION_VOCAB_SIZE,
                         input_length=max_sent_len,
                         mask_zero=True,
                         embeddings_regularizer=regularizers.l2(),
                         trainable=True))(entity_markers)

    # Merge word and position embeddings and apply the specified amount of
    # RNN layers
    x = layers.concatenate([word_embeddings, pos_embeddings])
    for _ in range(p["rnn1_layers"] - 1):
        lstm_layer = layers.LSTM(p['units1'], return_sequences=True)
        if p['bidirectional']:
            lstm_layer = layers.Bidirectional(lstm_layer)
        x = layers.wrappers.TimeDistributed(lstm_layer)(x)
    lstm_layer = layers.LSTM(p['units1'], return_sequences=False)
    if p['bidirectional']:
        lstm_layer = layers.Bidirectional(lstm_layer)
    sentence_matrix = layers.wrappers.TimeDistributed(lstm_layer)(x)

    ### Attention over ghosts ###
    layers_to_concat = []
    num_units = p['units1'] * (2 if p['bidirectional'] else 1)
    num_context = MAX_EDGES_PER_GRAPH - 1
    context_shape = (num_context, num_units)
    for i in range(MAX_EDGES_PER_GRAPH):
        # Compute a memory vector for the target entity pair.
        # ``i=i`` pins the edge index inside every Lambda function.
        sentence_vector = layers.Lambda(
            lambda l, i=i: l[:, i],
            output_shape=(num_units, ))(sentence_matrix)
        target_sentence_memory = layers.Dense(
            num_units, activation="linear",
            use_bias=False)(sentence_vector)

        # Context = every edge vector except the i-th one.
        if i == 0:
            context_vectors = layers.Lambda(
                lambda l, i=i: l[:, i + 1:],
                output_shape=context_shape)(sentence_matrix)
        elif i == MAX_EDGES_PER_GRAPH - 1:
            context_vectors = layers.Lambda(
                lambda l, i=i: l[:, :i],
                output_shape=context_shape)(sentence_matrix)
        else:
            context_vectors = layers.Lambda(
                lambda l, i=i: K.concatenate(
                    [l[:, :i], l[:, i + 1:]], axis=1),
                output_shape=context_shape)(sentence_matrix)

        # Compute the score between each memory and the memory of the
        # target entity pair. There is one score per context vector, so
        # the declared shape is MAX_EDGES_PER_GRAPH - 1 (it used to be
        # declared one too large).
        sentence_scores = layers.Lambda(
            lambda inputs: K.batch_dot(inputs[0], inputs[1], axes=(1, 2)),
            output_shape=(num_context, ))(
                [target_sentence_memory, context_vectors])
        sentence_scores = layers.Activation('softmax')(sentence_scores)

        # Compute the final vector by taking the weighted sum of context
        # vectors and the target entity vector
        context_vector = layers.Lambda(
            lambda inputs: K.batch_dot(inputs[0], inputs[1], axes=(1, 1)),
            output_shape=(num_units, ))([context_vectors, sentence_scores])
        edge_vector = layers.concatenate([sentence_vector, context_vector])
        edge_vector = layers.Reshape((1, num_units * 2))(edge_vector)
        layers_to_concat.append(edge_vector)
    edge_vectors = layers.concatenate(layers_to_concat, axis=1)

    # Apply softmax
    edge_vectors = layers.Dropout(p['dropout1'])(edge_vectors)
    main_output = layers.wrappers.TimeDistributed(
        layers.Dense(n_out, activation="softmax",
                     name='main_output'))(edge_vectors)

    model = models.Model(inputs=[sentence_input, entity_markers],
                         outputs=[main_output])
    optimizer = optimizers.Adam(lr=0.001)
    model.compile(optimizer=optimizer,
                  loss=masked_categorical_crossentropy,
                  metrics=['accuracy'])
    return model
def add_dim(tensor):
    """Return ``tensor`` unchanged unless its static shape is ``()``.

    A tensor whose ``K.int_shape`` is the empty tuple is passed through a
    ``Lambda`` layer that reshapes it to ``[1, 1]``.
    """
    if K.int_shape(tensor) != ():
        return tensor
    to_matrix = KL.Lambda(lambda t: K.reshape(t, [1, 1]))
    return to_matrix(tensor)
def _csp_stage(x, down_filters, branch_filters, res_filters, out_filters, n):
    """Run one down-sampling CSP stage of the backbone and return its output.

    The input is zero-padded on the top/left, down-sampled with a stride-2
    3x3 conv, then split into a shortcut branch and a residual branch that
    are concatenated and fused by a final 1x1 conv.
    """
    x = layers.ZeroPadding2D(padding=((1, 0), (1, 0)))(x)
    x = conv2d_unit(x, down_filters, 3, strides=2)
    shortcut = conv2d_unit(x, branch_filters, 1, strides=1)
    x = conv2d_unit(x, branch_filters, 1, strides=1)
    x = stack_residual_block(x, res_filters, branch_filters, n=n)
    x = conv2d_unit(x, branch_filters, 1, strides=1)
    x = layers.Concatenate()([x, shortcut])
    return conv2d_unit(x, out_filters, 1, strides=1)


def YOLOv4(inputs,
           num_classes,
           num_anchors,
           initial_filters=32,
           fast=False,
           anchors=None,
           conf_thresh=0.05,
           nms_thresh=0.45,
           keep_top_k=100,
           nms_top_k=100):
    """Build the YOLOv4 Keras model (CSPDarknet53 backbone, SPP + PAN neck).

    Args:
        inputs: Keras input tensor fed to the backbone.
        num_classes: Number of object classes.
        num_anchors: Number of anchors per output scale.
        initial_filters: Channel count of the stem conv; every other width
            is a multiple of it.
        fast: When true, decoding and fast NMS are appended to the graph via
            a Lambda layer, and the model has a single post-processed output.
        anchors: Per-scale anchors, indexed ``anchors[0]``, ``anchors[1]``,
            ``anchors[2]`` for the stride 8, 16 and 32 outputs. Required
            when ``fast`` is true.
        conf_thresh, nms_thresh, keep_top_k, nms_top_k: Forwarded unchanged
            to ``fastnms`` (only used when ``fast`` is true).

    Returns:
        A ``keras.models.Model``. With ``fast`` false, its outputs are the
        raw head tensors ``[output_l, output_m, output_s]``; with ``fast``
        true, its single output is whatever ``fastnms`` returns.

    Raises:
        ValueError: If ``fast`` is true and ``anchors`` is None.
    """
    # Fail early with a clear message instead of an opaque subscript error
    # from inside the post-processing Lambda.
    if fast and anchors is None:
        raise ValueError("anchors must be provided when fast=True")

    i32 = initial_filters
    i64 = i32 * 2
    i128 = i32 * 4
    i256 = i32 * 8
    i512 = i32 * 16
    i1024 = i32 * 32
    head_filters = num_anchors * (num_classes + 5)

    # Both modes feed the raw inputs to the backbone; no pre-processing
    # layer is applied here.
    x = inputs

    # ---- CSPDarknet53 backbone ----
    x = conv2d_unit(x, i32, 3, strides=1, padding='same')
    # ============================= s2 =============================
    s2 = _csp_stage(x, i64, i64, i32, i64, n=1)
    # ============================= s4 =============================
    s4 = _csp_stage(s2, i128, i64, i64, i128, n=2)
    # ============================= s8 =============================
    s8 = _csp_stage(s4, i256, i128, i128, i256, n=8)
    # ============================= s16 =============================
    s16 = _csp_stage(s8, i512, i256, i256, i512, n=8)
    # ============================= s32 =============================
    s32 = _csp_stage(s16, i1024, i512, i512, i1024, n=4)
    # ---- end of CSPDarknet53 backbone ----

    # ---- FPN part ----
    x = conv2d_unit(s32, i512, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i1024, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i512, 1, strides=1, act='leaky')
    x = spp(x)
    x = conv2d_unit(x, i512, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i1024, 3, strides=1, padding='same', act='leaky')
    fpn_s32 = conv2d_unit(x, i512, 1, strides=1, act='leaky')

    # pan01: up-sample the s32 features and fuse them with s16
    x = conv2d_unit(fpn_s32, i256, 1, strides=1, act='leaky')
    x = layers.UpSampling2D(2)(x)
    s16 = conv2d_unit(s16, i256, 1, strides=1, act='leaky')
    x = layers.Concatenate()([s16, x])
    x = conv2d_unit(x, i256, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i512, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i256, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i512, 3, strides=1, padding='same', act='leaky')
    fpn_s16 = conv2d_unit(x, i256, 1, strides=1, act='leaky')
    # end of pan01

    # pan02: up-sample the s16 features and fuse them with s8
    x = conv2d_unit(fpn_s16, i128, 1, strides=1, act='leaky')
    x = layers.UpSampling2D(2)(x)
    s8 = conv2d_unit(s8, i128, 1, strides=1, act='leaky')
    x = layers.Concatenate()([s8, x])
    x = conv2d_unit(x, i128, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i256, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i128, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i256, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i128, 1, strides=1, act='leaky')
    # end of pan02

    # output_s: no concat needed
    output_s = conv2d_unit(x, i256, 3, strides=1, padding='same', act='leaky')
    output_s = conv2d_unit(output_s, head_filters, 1, strides=1, bn=0,
                           act=None)

    # output_m: down-sample and concat with fpn_s16
    x = layers.ZeroPadding2D(padding=((1, 0), (1, 0)))(x)
    x = conv2d_unit(x, i256, 3, strides=2, act='leaky')
    x = layers.Concatenate()([x, fpn_s16])
    x = conv2d_unit(x, i256, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i512, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i256, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i512, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i256, 1, strides=1, act='leaky')
    output_m = conv2d_unit(x, i512, 3, strides=1, padding='same', act='leaky')
    output_m = conv2d_unit(output_m, head_filters, 1, strides=1, bn=0,
                           act=None)

    # output_l: down-sample and concat with fpn_s32
    x = layers.ZeroPadding2D(padding=((1, 0), (1, 0)))(x)
    x = conv2d_unit(x, i512, 3, strides=2, act='leaky')
    x = layers.Concatenate()([x, fpn_s32])
    x = conv2d_unit(x, i512, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i1024, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i512, 1, strides=1, act='leaky')
    x = conv2d_unit(x, i1024, 3, strides=1, padding='same', act='leaky')
    x = conv2d_unit(x, i512, 1, strides=1, act='leaky')
    output_l = conv2d_unit(x, i1024, 3, strides=1, padding='same',
                           act='leaky')
    output_l = conv2d_unit(output_l, head_filters, 1, strides=1, bn=0,
                           act=None)

    if not fast:
        return keras.models.Model(inputs=inputs,
                                  outputs=[output_l, output_m, output_s])

    # Implement the post-processing with tensor operations.
    def output_layer(args):
        raw_s, raw_m, raw_l = args

        # Decode the coordinates first (strides 8 / 16 / 32).
        pred_xywh_s, pred_conf_s, pred_prob_s = decode(
            raw_s, anchors[0], 8, num_classes)
        pred_xywh_m, pred_conf_m, pred_prob_m = decode(
            raw_m, anchors[1], 16, num_classes)
        pred_xywh_l, pred_conf_l, pred_prob_l = decode(
            raw_l, anchors[2], 32, num_classes)

        # Per-class scores = objectness confidence * class probability.
        pred_score_s = pred_conf_s * pred_prob_s
        pred_score_m = pred_conf_m * pred_prob_m
        pred_score_l = pred_conf_l * pred_prob_l

        # Gather the predictions of all output layers before running NMS.
        # Presumably [batch_size, N, 4] and [batch_size, N, num_classes]
        # -- TODO confirm against `decode`.
        all_pred_boxes = tf.concat([pred_xywh_s, pred_xywh_m, pred_xywh_l],
                                   axis=1)
        all_pred_scores = tf.concat(
            [pred_score_s, pred_score_m, pred_score_l], axis=1)

        # Use fast NMS.
        return fastnms(all_pred_boxes, all_pred_scores, conf_thresh,
                       nms_thresh, keep_top_k, nms_top_k)

    output = layers.Lambda(output_layer)([output_s, output_m, output_l])
    return keras.models.Model(inputs=inputs, outputs=output)
# %%
# Build our model


# We create a function to integrate the tensorflow model with a Keras model
# This requires explicitly casting the tensor to a string, because of a Keras quirk
def ElmoEmbedding(x):
    """Embed a batch of sentences with the ELMo module.

    ``x`` comes from an Input layer of shape ``(1,)``, i.e. ``(batch, 1)``.
    Only that trailing axis is squeezed: an unrestricted ``tf.squeeze``
    would also drop the batch axis whenever the batch holds a single
    sentence, handing the module a scalar instead of a 1-D batch.
    """
    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)
    return elmo_model(sentences, signature="default",
                      as_dict=True)["default"]


# One raw string per example -> ELMo sentence embedding -> dense classifier.
input_text = layers.Input(shape=(1, ), dtype=tf.string)
x = layers.Lambda(ElmoEmbedding, output_shape=(1024, ))(input_text)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dense(1, activation='sigmoid')(x)
model = Model(inputs=[input_text], outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# %%
# Create datasets (Only take up to 150 words for memory)
train_text = train_df['sentence'].tolist()
train_text = [' '.join(t.split()[0:150]) for t in train_text]
# Object-dtype column vector of shape (num_examples, 1), matching the
# (1,)-shaped string Input above.
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
def test_lambda(self):
    """Apply a squaring Lambda layer to a Normal built from 100x10x5 tensors."""
    dims = [100, 10, 5]
    x = Normal(loc=tf.zeros(dims), scale=tf.ones(dims))
    square = layers.Lambda(lambda t: t**2)
    y = square(x)