def build_relational(obs_spec, act_spec, data_format='channels_first',
                     broadcast_non_spatial=False):
    # https://github.com/deepmind/pysc2/blob/master/docs/environment.md#last-actions
    # obs_spec: screen, minimap, player (11,), last_actions (n,)
    # At each time step agents are presented with 4 sources of information:
    # minimap, screen, player, and previous-action.
    assert broadcast_non_spatial is False, \
        'broadcast_non_spatial should be False for relational agents'

    batch_size = None
    channel_3 = 16
    channel_2 = 96

    # TODO: set spatial_dim <- 64
    screen, screen_input = spatial_block('screen', obs_spec.spaces[0],
                                         conv_cfg(data_format, 'relu'),
                                         batch_size=batch_size)
    minimap, minimap_input = spatial_block('minimap', obs_spec.spaces[1],
                                           conv_cfg(data_format, 'relu'),
                                           batch_size=batch_size)

    # TODO: obs_spec[2:] <- ['available_actions', 'player', 'last_actions']
    non_spatial_inputs_list = [Input(s.shape, batch_size=batch_size)
                               for s in obs_spec.spaces[2:]]
    available_actions = non_spatial_inputs_list[0]
    non_spatial_inputs = Concatenate(axis=1, name='non_spatial_inputs')(
        non_spatial_inputs_list[1:])

    # input_2d: [30, 64], input_3d: [30, 9, 8, 8]
    input_2d = _mlp2(Flatten()(non_spatial_inputs), units=[128, 64],
                     cfg=dense_cfg('relu'))
    input_3d = Concatenate(axis=1, name='state_block')([screen, minimap])

    # TODO: treat channel_x as parameters or read from configuration files
    class ExpandDims(Lambda):
        def __init__(self, axis):
            Lambda.__init__(self, lambda x: tf.expand_dims(x, axis))

    # input_3d = ExpandDims(axis=1)(input_3d)
    # # output_3d: [30, 96, 8, 8]
    # # TODO: unroll length
    # output_3d = ConvLSTM2D(
    #     filters=channel_2,
    #     kernel_size=3,
    #     stateful=True,
    #     **conv2dlstm_cfg()
    # )(input_3d)
    output_3d = Conv2D(32, 3, **conv_cfg(data_format, 'relu'))(input_3d)
    output_3d = Conv2D(96, 3, **conv_cfg(data_format, 'relu'))(output_3d)

    # relational_spatial: [30, 32, 8, 8]
    relational_spatial = _resnet12(output_3d, filters=[64, 48, 32, 32],
                                   cfg=conv_cfg(data_format, 'relu'))
    # relational_spatial: [30, 16, 32, 32]
    relational_spatial = _deconv4x(relational_spatial,
                                   filters=[channel_3, channel_3],
                                   kernel_sizes=[4, 4],
                                   cfg=deconv_cfg(data_format, 'relu'))

    # TODO: check scale factor
    # relational_nonspatial: [30, 512]
    relational_nonspatial = _mlp2(Flatten()(output_3d), units=[512, 512],
                                  cfg=dense_cfg('relu'))

    # shared_features: [30, 512+64=576]
    shared_features = Concatenate(axis=1, name='shared_features')(
        [relational_nonspatial, input_2d])

    # value: [30,]
    value = _mlp2(shared_features, units=[256, 1],
                  cfg=dense_cfg('relu', scale=0.1))
    value = Squeeze(axis=-1)(value)

    # policy_logits: [30, #actions=549]
    policy_logits = _mlp2(shared_features,
                          units=[512, list(act_spec)[0].size()],
                          cfg=dense_cfg('relu', scale=0.1))

    mask_actions = Lambda(
        lambda x: tf.where(available_actions > 0, x, -1000 * tf.ones_like(x)),
        name='mask_unavailable_action_ids')
    policy_logits = mask_actions(policy_logits)

    # TODO: check
    return Model(
        inputs=[screen_input, minimap_input] + non_spatial_inputs_list,
        outputs=[shared_features, policy_logits, relational_spatial, value])
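
# Usage sketch (illustrative, not part of the original module): assumes
# `obs_spec` and `act_spec` come from the surrounding codebase's environment
# adapter (a pysc2-style wrapper); the helper name below is hypothetical.
def _example_relational_forward(obs_spec, act_spec):
    model = build_relational(obs_spec, act_spec, data_format='channels_first')
    # Outputs, in order: shared features [B, 576], policy logits with
    # unavailable action ids pushed to -1000 (so a softmax assigns them ~0
    # probability), the upsampled spatial-relational map, and the value head.
    shared_features, policy_logits, relational_spatial, value = model.outputs
    return model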
def siamese_model(input_shape=None,
                  track_length=1,
                  features=None,
                  neighborhood_scale_size=10,
                  reg=1e-5,
                  init='he_normal',
                  softmax=True,
                  norm_method='std',
                  filter_size=61):

    def compute_input_shape(feature):
        if feature == 'appearance':
            return input_shape
        elif feature == 'distance':
            return (None, 2)
        elif feature == 'neighborhood':
            return (None, 2 * neighborhood_scale_size + 1,
                    2 * neighborhood_scale_size + 1, 1)
        elif feature == 'regionprop':
            return (None, 3)
        else:
            raise ValueError('siamese_model.compute_input_shape: '
                             'Unknown feature `{}`'.format(feature))

    def compute_reshape(feature):
        if feature == 'appearance':
            return (64,)
        elif feature == 'distance':
            return (2,)
        elif feature == 'neighborhood':
            return (64,)
        elif feature == 'regionprop':
            return (3,)
        else:
            raise ValueError('siamese_model.compute_reshape: '
                             'Unknown feature `{}`'.format(feature))

    def compute_feature_extractor(feature, shape):
        if feature == 'appearance':
            # This should not stay: channels_first/last should be used to
            # dictate size (1 works for either right now)
            N_layers = int(np.floor(np.log2(input_shape[1])))
            feature_extractor = Sequential()
            feature_extractor.add(InputLayer(input_shape=shape))
            # feature_extractor.add(ImageNormalization2D(norm_method='std',
            #                                            filter_size=32))
            for layer in range(N_layers):
                feature_extractor.add(Conv3D(64, (1, 3, 3),
                                             kernel_initializer=init,
                                             padding='same',
                                             kernel_regularizer=l2(reg)))
                feature_extractor.add(BatchNormalization(axis=channel_axis))
                feature_extractor.add(Activation('relu'))
                feature_extractor.add(MaxPool3D(pool_size=(1, 2, 2)))
            feature_extractor.add(Reshape((-1, 64)))
            return feature_extractor
        elif feature == 'distance':
            return None
        elif feature == 'neighborhood':
            N_layers_og = int(np.floor(
                np.log2(2 * neighborhood_scale_size + 1)))
            feature_extractor_neighborhood = Sequential()
            feature_extractor_neighborhood.add(
                InputLayer(input_shape=(None,
                                        2 * neighborhood_scale_size + 1,
                                        2 * neighborhood_scale_size + 1,
                                        1)))
            for layer in range(N_layers_og):
                feature_extractor_neighborhood.add(
                    Conv3D(64, (1, 3, 3),
                           kernel_initializer=init,
                           padding='same',
                           kernel_regularizer=l2(reg)))
                feature_extractor_neighborhood.add(
                    BatchNormalization(axis=channel_axis))
                feature_extractor_neighborhood.add(Activation('relu'))
                feature_extractor_neighborhood.add(
                    MaxPool3D(pool_size=(1, 2, 2)))
            feature_extractor_neighborhood.add(Reshape((-1, 64)))
            return feature_extractor_neighborhood
        elif feature == 'regionprop':
            return None
        else:
            raise ValueError('siamese_model.compute_feature_extractor: '
                             'Unknown feature `{}`'.format(feature))

    if features is None:
        raise ValueError('siamese_model: No features specified.')

    if K.image_data_format() == 'channels_first':
        channel_axis = 1
        input_shape = (input_shape[0], None, *input_shape[1:])
    else:
        channel_axis = -1
        input_shape = (None, *input_shape)

    features = sorted(features)

    inputs = []
    outputs = []
    for feature in features:
        in_shape = compute_input_shape(feature)
        re_shape = compute_reshape(feature)
        feature_extractor = compute_feature_extractor(feature, in_shape)

        layer_1 = Input(shape=in_shape)
        layer_2 = Input(shape=in_shape)
        inputs.extend([layer_1, layer_2])

        # apply feature_extractor if it exists
        if feature_extractor is not None:
            layer_1 = feature_extractor(layer_1)
            layer_2 = feature_extractor(layer_2)

        # LSTM on 'left' side of network since that side takes in
        # stacks of features
        layer_1 = LSTM(64)(layer_1)
        layer_2 = Reshape(re_shape)(layer_2)

        outputs.append([layer_1, layer_2])

    dense_merged = []
    for layer_1, layer_2 in outputs:
        merge = Concatenate(axis=channel_axis)([layer_1, layer_2])
        dense_merge = Dense(128)(merge)
        bn_merge = BatchNormalization(axis=channel_axis)(dense_merge)
        dense_relu = Activation('relu')(bn_merge)
        dense_merged.append(dense_relu)

    # Concatenate outputs from both instances
    merged_outputs = Concatenate(axis=channel_axis)(dense_merged)

    # Add dense layers
    dense1 = Dense(128)(merged_outputs)
    bn1 = BatchNormalization(axis=channel_axis)(dense1)
    relu1 = Activation('relu')(bn1)
    dense2 = Dense(128)(relu1)
    bn2 = BatchNormalization(axis=channel_axis)(dense2)
    relu2 = Activation('relu')(bn2)
    dense3 = Dense(3, activation='softmax')(relu2)

    # Instantiate model
    final_layer = dense3
    model = Model(inputs=inputs, outputs=final_layer)

    return model
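
# Usage sketch (illustrative, assuming channels_last data format; shapes and
# the helper name are hypothetical). The leading axis of each feature input
# is the frames/track axis consumed by the LSTM branch.
def _example_siamese():
    features = ['appearance', 'distance', 'neighborhood', 'regionprop']
    model = siamese_model(input_shape=(32, 32, 1), features=features)
    # Each feature contributes one input per cell being compared, so the
    # model takes 2 * len(features) inputs and ends in a 3-way softmax.
    assert len(model.inputs) == 2 * len(features)
    return model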
def bn_feature_net_2D(receptive_field=61,
                      input_shape=(256, 256, 1),
                      n_features=3,
                      n_channels=1,
                      reg=1e-5,
                      n_conv_filters=64,
                      n_dense_filters=200,
                      VGG_mode=False,
                      init='he_normal',
                      norm_method='std',
                      location=False,
                      dilated=False,
                      padding=False,
                      padding_mode='reflect',
                      multires=False,
                      include_top=True):
    # Create layers list (x) to store all of the layers.
    # We need to use the functional API to enable the multiresolution mode
    x = []

    win = (receptive_field - 1) // 2

    if dilated:
        padding = True

    if K.image_data_format() == 'channels_first':
        channel_axis = 1
        row_axis = 2
        col_axis = 3
        if not dilated:
            input_shape = (n_channels, receptive_field, receptive_field)
    else:
        row_axis = 1
        col_axis = 2
        channel_axis = -1
        if not dilated:
            input_shape = (receptive_field, receptive_field, n_channels)

    x.append(Input(shape=input_shape))
    x.append(ImageNormalization2D(norm_method=norm_method,
                                  filter_size=receptive_field)(x[-1]))

    if padding:
        if padding_mode == 'reflect':
            x.append(ReflectionPadding2D(padding=(win, win))(x[-1]))
        elif padding_mode == 'zero':
            x.append(ZeroPadding2D(padding=(win, win))(x[-1]))

    if location:
        x.append(Location2D(in_shape=tuple(x[-1].shape.as_list()[1:]))(x[-1]))
        x.append(Concatenate(axis=channel_axis)([x[-2], x[-1]]))

    if multires:
        layers_to_concat = []

    rf_counter = receptive_field
    block_counter = 0
    d = 1

    while rf_counter > 4:
        filter_size = 3 if rf_counter % 2 == 0 else 4
        x.append(Conv2D(n_conv_filters, (filter_size, filter_size),
                        dilation_rate=d,
                        kernel_initializer=init,
                        padding='valid',
                        kernel_regularizer=l2(reg))(x[-1]))
        x.append(BatchNormalization(axis=channel_axis)(x[-1]))
        x.append(Activation('relu')(x[-1]))

        block_counter += 1
        rf_counter -= filter_size - 1

        if block_counter % 2 == 0:
            if dilated:
                x.append(DilatedMaxPool2D(dilation_rate=d,
                                          pool_size=(2, 2))(x[-1]))
                d *= 2
            else:
                x.append(MaxPool2D(pool_size=(2, 2))(x[-1]))

            if VGG_mode:
                n_conv_filters *= 2

            rf_counter = rf_counter // 2

            if multires:
                layers_to_concat.append(len(x) - 1)

    if multires:
        c = []
        for l in layers_to_concat:
            output_shape = x[l].get_shape().as_list()
            target_shape = x[-1].get_shape().as_list()

            row_crop = int(output_shape[row_axis] - target_shape[row_axis])
            if row_crop % 2 == 0:
                row_crop = (row_crop // 2, row_crop // 2)
            else:
                row_crop = (row_crop // 2, row_crop // 2 + 1)

            col_crop = int(output_shape[col_axis] - target_shape[col_axis])
            if col_crop % 2 == 0:
                col_crop = (col_crop // 2, col_crop // 2)
            else:
                col_crop = (col_crop // 2, col_crop // 2 + 1)

            cropping = (row_crop, col_crop)
            c.append(Cropping2D(cropping=cropping)(x[l]))
        x.append(Concatenate(axis=channel_axis)(c))

    x.append(Conv2D(n_dense_filters, (rf_counter, rf_counter),
                    dilation_rate=d,
                    kernel_initializer=init,
                    padding='valid',
                    kernel_regularizer=l2(reg))(x[-1]))
    x.append(BatchNormalization(axis=channel_axis)(x[-1]))
    x.append(Activation('relu')(x[-1]))

    x.append(TensorProduct(n_dense_filters,
                           kernel_initializer=init,
                           kernel_regularizer=l2(reg))(x[-1]))
    x.append(BatchNormalization(axis=channel_axis)(x[-1]))
    x.append(Activation('relu')(x[-1]))

    x.append(TensorProduct(n_features,
                           kernel_initializer=init,
                           kernel_regularizer=l2(reg))(x[-1]))

    if not dilated:
        x.append(Flatten()(x[-1]))

    if include_top:
        x.append(Softmax(axis=channel_axis)(x[-1]))

    model = Model(inputs=x[0], outputs=x[-1])

    return model
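
# Usage sketch (illustrative values; the helper name is hypothetical). With
# dilated=False the input shape is overridden to a single
# receptive_field-sized patch and the top is flattened to per-patch class
# scores; with dilated=True the net stays fully convolutional over
# `input_shape` for dense per-pixel prediction.
def _example_bn_feature_nets():
    patch_net = bn_feature_net_2D(receptive_field=61, n_features=3)
    pixel_net = bn_feature_net_2D(receptive_field=61,
                                  input_shape=(256, 256, 1),
                                  n_features=3,
                                  dilated=True)
    return patch_net, pixel_net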
def RetinaMask(backbone,
               num_classes,
               input_shape,
               inputs=None,
               backbone_levels=['C3', 'C4', 'C5'],
               pyramid_levels=['P3', 'P4', 'P5', 'P6', 'P7'],
               norm_method='whole_image',
               location=False,
               use_imagenet=False,
               crop_size=(14, 14),
               pooling=None,
               mask_dtype=K.floatx(),
               required_channels=3,
               frames_per_batch=1,
               **kwargs):
    """Constructs a RetinaMask model using a backbone from keras-applications.

    Args:
        backbone (str): Name of backbone to use.
        num_classes (int): Number of classes to classify.
        input_shape (tuple): The shape of the input data.
        inputs (tensor): Optional input tensor, overriding ``input_shape``.
        backbone_levels (list): Backbone levels used to build the
            feature pyramid.
        pyramid_levels (list): Pyramid levels to attach the prediction
            heads to.
        norm_method (str): Normalization method used by the
            ImageNormalization2D layer.
        location (bool): Whether to concatenate a Location2D layer
            to the inputs.
        use_imagenet (bool): Whether to load imagenet-based
            pretrained weights.
        crop_size (tuple): 2-length tuple for the size of the crops
            used by the mask head.
        pooling (str): Optional pooling mode for feature extraction
            when include_top is False.
            - None means that the output of the model will be the 4D tensor
              output of the last convolutional layer.
            - 'avg' means that global average pooling will be applied to the
              output of the last convolutional layer, and thus the output of
              the model will be a 2D tensor.
            - 'max' means that global max pooling will be applied.
        mask_dtype (dtype): Dtype to use for the mask tensors.
        required_channels (int): The required number of channels of the
            backbone. 3 is the default for all current backbones.
        frames_per_batch (int): Number of frames per batch; values greater
            than 1 enable the TimeDistributed (3D) path.

    Returns:
        tensorflow.keras.Model: RetinaMask model with a backbone.
    """
    channel_axis = 1 if K.image_data_format() == 'channels_first' else -1

    if inputs is None:
        if frames_per_batch > 1:
            if channel_axis == 1:
                input_shape_with_time = tuple(
                    [input_shape[0], frames_per_batch] + list(input_shape)[1:])
            else:
                input_shape_with_time = tuple(
                    [frames_per_batch] + list(input_shape))
            inputs = Input(shape=input_shape_with_time)
        else:
            inputs = Input(shape=input_shape)

    if location:
        if frames_per_batch > 1:
            # TODO: TimeDistributed is incompatible with channels_first
            loc = TimeDistributed(Location2D(in_shape=input_shape))(inputs)
        else:
            loc = Location2D(in_shape=input_shape)(inputs)
        concat = Concatenate(axis=channel_axis)([inputs, loc])
    else:
        concat = inputs

    # force the channel size for backbone input to be `required_channels`
    if frames_per_batch > 1:
        norm = TimeDistributed(
            ImageNormalization2D(norm_method=norm_method))(concat)
        fixed_inputs = TimeDistributed(TensorProduct(required_channels))(norm)
    else:
        norm = ImageNormalization2D(norm_method=norm_method)(concat)
        fixed_inputs = TensorProduct(required_channels)(norm)

    # force the input shape
    axis = 0 if K.image_data_format() == 'channels_first' else -1
    fixed_input_shape = list(input_shape)
    fixed_input_shape[axis] = required_channels
    fixed_input_shape = tuple(fixed_input_shape)

    model_kwargs = {
        'include_top': False,
        'weights': None,
        'input_shape': fixed_input_shape,
        'pooling': pooling
    }

    _, backbone_dict = get_backbone(backbone, fixed_inputs,
                                    use_imagenet=use_imagenet,
                                    frames_per_batch=frames_per_batch,
                                    return_dict=True,
                                    **model_kwargs)

    # create the full model
    return retinanet_mask(inputs=inputs,
                          num_classes=num_classes,
                          backbone_dict=backbone_dict,
                          crop_size=crop_size,
                          backbone_levels=backbone_levels,
                          pyramid_levels=pyramid_levels,
                          name='{}_retinanet_mask'.format(backbone),
                          mask_dtype=mask_dtype,
                          frames_per_batch=frames_per_batch,
                          **kwargs)
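
# Usage sketch (illustrative; the backbone name must be one supported by this
# codebase's `get_backbone`, 'resnet50' is assumed here, and the helper name
# is hypothetical). The single-channel input is normalized and projected to
# `required_channels=3` by a TensorProduct layer, so an ImageNet-style
# backbone can consume it unchanged.
def _example_retinamask():
    model = RetinaMask(backbone='resnet50',
                       num_classes=3,
                       input_shape=(256, 256, 1))
    return model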