def preact_residual_dmixconv_block(data, channels, channels_operating, name, kernels=None, act_type='relu',
                                   use_se=True):
    """
    Returns a pre-activation residual block with a depthwise mixed convolution and without any max pooling operation
    :param data: Input data
    :param channels: Number of filters for all CNN-layers
    :param channels_operating: Number of filters used for the depthwise mixed convolution
    :param name: Name for the residual block
    :param kernels: List of kernel sizes used for the mixed convolution
    :param act_type: Activation function to use
    :param use_se: If true, a squeeze excitation module will be used
    :return: symbol
    """
    bn1 = mx.sym.BatchNorm(data=data, name=name + '_bn1')
    conv1 = mx.sym.Convolution(data=bn1, num_filter=channels_operating, kernel=(1, 1), pad=(0, 0), no_bias=True,
                               name=name + '_conv1')
    bn2 = mx.sym.BatchNorm(data=conv1, name=name + '_bn2')
    act1 = get_act(data=bn2, act_type=act_type, name=name + '_act1')
    conv2 = mix_conv(data=act1, channels=channels_operating, kernels=kernels, name=name + '_conv2')
    bn3 = mx.sym.BatchNorm(data=conv2, name=name + '_bn3')
    act2 = get_act(data=bn3, act_type=act_type, name=name + '_act2')
    out = mx.sym.Convolution(data=act2, num_filter=channels, kernel=(1, 1), pad=(0, 0), no_bias=True,
                             name=name + '_conv3')
    # out = mx.sym.BatchNorm(data=out, name=name + '_bn4')
    if use_se:
        out = channel_squeeze_excitation(out, channels, name=name + '_se', ratio=4, act_type=act_type,
                                         use_hard_sigmoid=True)
    out_sum = mx.sym.broadcast_add(data, out, name=name + '_add')
    return out_sum
def bottleneck_residual_block_v2(data, channels, channels_operating, name, kernel, act_type='relu', norm_type="bn",
                                 se_type=None):
    """
    Returns a bottleneck residual block without any max pooling operation
    :param data: Input data
    :param channels: Number of filters for all CNN-layers
    :param channels_operating: Number of filters used for the depthwise convolution
    :param name: Name for the residual block
    :param kernel: Kernel size of the depthwise convolution
    :param act_type: Activation function to use
    :param norm_type: Normalization layer type to use
    :param se_type: Squeeze excitation module type. Available [None, "se", "cbam", "ca_se", "cm_se", "sa_se", "sm_se"]
    :return: symbol
    """
    if se_type:
        next_input = get_se_layer(data, channels, se_type, name=name + '_se', use_hard_sigmoid=True)
    else:
        next_input = data
    conv1 = mx.sym.Convolution(data=next_input, num_filter=channels_operating, kernel=(1, 1), pad=(0, 0),
                               no_bias=True, name=name + '_conv1')
    bn1 = get_norm_layer(data=conv1, norm_type=norm_type, name=name + '_bn1')
    act1 = get_act(data=bn1, act_type=act_type, name=name + '_act1')
    conv2 = mx.sym.Convolution(data=act1, num_filter=channels_operating, kernel=(kernel, kernel), stride=(1, 1),
                               num_group=channels_operating, pad=(kernel // 2, kernel // 2), no_bias=True,
                               name=name + '_conv2')
    bn2 = get_norm_layer(data=conv2, norm_type=norm_type, name=name + '_bn2')
    act2 = get_act(data=bn2, act_type=act_type, name=name + '_act2')
    conv3 = mx.sym.Convolution(data=act2, num_filter=channels, kernel=(1, 1), pad=(0, 0), no_bias=True,
                               name=name + '_conv3')
    bn3 = get_norm_layer(data=conv3, norm_type=norm_type, name=name + '_bn3')
    sum_out = mx.sym.broadcast_add(bn3, data, name=name + '_add')
    return sum_out
def sandglass_block(data, channels, channels_reduced, name, kernel, act_type='relu', norm_type="bn",
                    se_type="eca_se"):
    """
    Returns a sandglass block as described in:
    "Rethinking Bottleneck Structure for Efficient Mobile Network Design" - D. Zhou and Q. Hou et al.
    :param data: Input data
    :param channels: Number of filters for the depthwise convolutions
    :param channels_reduced: Reduced number of filters for the 1x1 bottleneck convolution
    :param name: Name for the block
    :param kernel: Kernel size of the first depthwise convolution
    :param act_type: Activation function to use
    :param norm_type: Normalization layer type to use
    :param se_type: Squeeze excitation module type
    :return: symbol
    """
    first_kernel = kernel
    conv1 = mx.sym.Convolution(data=data, num_filter=channels, kernel=(first_kernel, first_kernel),
                               pad=(first_kernel // 2, first_kernel // 2), num_group=channels, no_bias=True,
                               name=name + '_conv1')
    bn1 = get_norm_layer(data=conv1, norm_type=norm_type, name=name + '_bn1')
    act1 = get_act(data=bn1, act_type=act_type, name=name + '_act1')
    if se_type:
        next_input = get_se_layer(act1, channels, se_type, name=name + '_se', use_hard_sigmoid=True)
    else:
        next_input = act1
    conv2 = mx.sym.Convolution(data=next_input, num_filter=channels_reduced, kernel=(1, 1), pad=(0, 0),
                               no_bias=False, name=name + '_conv2')
    conv3 = mx.sym.Convolution(data=conv2, num_filter=channels, kernel=(1, 1), pad=(0, 0), no_bias=True,
                               name=name + '_conv3')
    bn2 = get_norm_layer(data=conv3, norm_type=norm_type, name=name + '_bn2')
    act2 = get_act(data=bn2, act_type=act_type, name=name + '_act2')
    last_kernel = 3
    conv4 = mx.sym.Convolution(data=act2, num_filter=channels, kernel=(last_kernel, last_kernel),
                               pad=(last_kernel // 2, last_kernel // 2), num_group=channels, no_bias=False,
                               name=name + '_conv4')
    sum_out = mx.sym.broadcast_add(conv4, data, name=name + '_add')
    return sum_out
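def _demo_sandglass_shapes():
    # Hypothetical sanity check (a sketch, not part of the original file): the two
    # depthwise convolutions keep `channels` filters while the 1x1 pair squeezes to
    # channels_reduced and back, so the residual broadcast_add sees matching shapes.
    # The 256-channel 8x8 input is an assumption based on the defaults in this file.
    data = mx.sym.Variable(name='data')
    block = sandglass_block(data, channels=256, channels_reduced=64, name='sg_demo', kernel=3)
    _, out_shapes, _ = block.infer_shape(data=(1, 256, 8, 8))
    assert out_shapes == [(1, 256, 8, 8)]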
def preact_residual_block(data, channels, name, kernel=3, act_type='relu'):
    """
    Returns a pre-activation residual block without any max pooling operation
    :param data: Input data
    :param channels: Number of filters for all CNN-layers
    :param name: Name for the residual block
    :param kernel: Kernel size for the convolutions
    :param act_type: Activation function to use
    :return: symbol
    """
    bn1 = mx.sym.BatchNorm(data=data, name=name + '_bn1')
    conv1 = mx.sym.Convolution(data=bn1, num_filter=channels, kernel=(kernel, kernel),
                               pad=(kernel // 2, kernel // 2), num_group=1, no_bias=True, name=name + '_conv1')
    bn2 = mx.sym.BatchNorm(data=conv1, name=name + '_bn2')
    act1 = get_act(data=bn2, act_type=act_type, name=name + '_act1')
    conv2 = mx.sym.Convolution(data=act1, num_filter=channels, kernel=(kernel, kernel), stride=(1, 1),
                               pad=(kernel // 2, kernel // 2), no_bias=True, name=name + '_conv2')
    sum_out = mx.sym.broadcast_add(data, conv2, name=name + '_add')  # avoid shadowing the built-in sum()
    return sum_out
def residual_block(data, channels, name, kernel=3, act_type='relu', use_se=False):
    """
    Returns a residual block without any max pooling operation
    :param data: Input data
    :param channels: Number of filters for all CNN-layers
    :param name: Name for the residual block
    :param kernel: Kernel size for the convolutions
    :param act_type: Activation function to use
    :param use_se: If true, a squeeze excitation module will be used
    :return: symbol
    """
    if use_se:
        se = channel_squeeze_excitation(data, channels, name=name + '_se', ratio=2, act_type=act_type)
        conv1 = mx.sym.Convolution(data=se, num_filter=channels, kernel=(kernel, kernel),
                                   pad=(kernel // 2, kernel // 2), num_group=1, no_bias=True, name=name + '_conv1')
    else:
        conv1 = mx.sym.Convolution(data=data, num_filter=channels, kernel=(kernel, kernel),
                                   pad=(kernel // 2, kernel // 2), num_group=1, no_bias=True, name=name + '_conv1')
    act1 = get_act(data=conv1, act_type=act_type, name=name + '_act1')
    bn1 = mx.sym.BatchNorm(data=act1, name=name + '_bn1')
    conv2 = mx.sym.Convolution(data=bn1, num_filter=channels, kernel=(kernel, kernel), stride=(1, 1), num_group=1,
                               pad=(kernel // 2, kernel // 2), no_bias=True, name=name + '_conv2')
    bn2 = mx.sym.BatchNorm(data=conv2, name=name + '_bn2')
    sum_out = mx.sym.broadcast_add(data, bn2, name=name + '_add')  # avoid shadowing the built-in sum()
    return sum_out
def bottleneck_residual_block(data, channels, channels_operating, name, kernel=3, act_type='relu', use_se=False,
                              data_variant=None):
    """
    Returns a bottleneck residual block without any max pooling operation
    :param data: Input data
    :param channels: Number of filters for all CNN-layers
    :param channels_operating: Number of filters used for the 3x3, 5x5, 7x7, ... depthwise convolution
    :param name: Name for the residual block
    :param kernel: Kernel size of the depthwise convolution
    :param act_type: Activation function to use
    :param use_se: If true, a squeeze excitation module will be used
    :param data_variant: Data input which holds the current active variant information
    :return: symbol
    """
    if data_variant is not None:
        first_input = mx.sym.Concat(*[data, data_variant], name=name + '_concat')
        add_channels = NB_CHANNELS_VARIANTS
    else:
        first_input = data
        add_channels = 0

    if use_se:
        se = channel_squeeze_excitation(first_input, channels + add_channels, name=name + '_se', ratio=2)
        conv1 = mx.sym.Convolution(data=se, num_filter=channels_operating, kernel=(1, 1), pad=(0, 0), no_bias=True,
                                   name=name + '_conv1')
    else:
        conv1 = mx.sym.Convolution(data=first_input, num_filter=channels_operating, kernel=(1, 1), pad=(0, 0),
                                   no_bias=True, name=name + '_conv1')
    bn1 = mx.sym.BatchNorm(data=conv1, name=name + '_bn1')
    act1 = get_act(data=bn1, act_type=act_type, name=name + '_act1')
    conv2 = mx.sym.Convolution(data=act1, num_filter=channels_operating, kernel=(kernel, kernel), stride=(1, 1),
                               num_group=channels_operating, pad=(kernel // 2, kernel // 2), no_bias=True,
                               name=name + '_conv2')
    bn2 = mx.sym.BatchNorm(data=conv2, name=name + '_bn2')
    act2 = get_act(data=bn2, act_type=act_type, name=name + '_act2')
    conv3 = mx.sym.Convolution(data=act2, num_filter=channels, kernel=(1, 1), pad=(0, 0), no_bias=True,
                               name=name + '_conv3')
    bn3 = mx.sym.BatchNorm(data=conv3, name=name + '_bn3')
    sum_out = mx.sym.broadcast_add(bn3, data, name=name + '_add')  # avoid shadowing the built-in sum()
    return sum_out
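def _demo_bottleneck_variant_shapes():
    # Hypothetical sanity check (a sketch, not part of the original file; assumes
    # NB_CHANNELS_VARIANTS is defined at module level): the variant planes are
    # concatenated channel-wise before the first 1x1 convolution, and the final 1x1
    # convolution restores `channels`, so the residual add over `data` is valid.
    data = mx.sym.Variable(name='data')
    variant = mx.sym.Variable(name='variant')
    block = bottleneck_residual_block(data, channels=256, channels_operating=128, name='bneck_demo',
                                      data_variant=variant)
    _, out_shapes, _ = block.infer_shape(data=(1, 256, 8, 8), variant=(1, NB_CHANNELS_VARIANTS, 8, 8))
    assert out_shapes == [(1, 256, 8, 8)]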
def preact_resnet_symbol(channels=256, channels_value_head=8, channels_policy_head=81, value_fc_size=256,
                         value_kernelsize=7, res_blocks=19, act_type='relu', n_labels=4992,
                         grad_scale_value=0.01, grad_scale_policy=0.99, select_policy_from_plane=True):
    """
    Creates the pre-activation AlphaZero model symbol based on the given parameters.
    :param channels: Used for all convolution operations. (Except the last 2)
    :param channels_value_head: Number of channels for the value head
    :param channels_policy_head: Number of channels for the policy head
    :param value_fc_size: Fully connected layer size. Used for the value output
    :param value_kernelsize: Kernel size of the convolution in the value head
    :param res_blocks: Number of residual blocks to stack. In the paper they used 19 or 39 residual blocks
    :param act_type: Activation function which will be used for all intermediate layers
    :param n_labels: Number of labels for the policy
    :param grad_scale_value: Constant scalar with which the gradient for the value output is scaled.
     (They used 1.0 by default and 0.01 in the supervised setting)
    :param grad_scale_policy: Constant scalar with which the gradient for the policy output is scaled.
     (They used 1.0 by default and 0.99 in the supervised setting)
    :param select_policy_from_plane: True, if the plane policy head type shall be used
    :return: mxnet symbol of the model
    """
    # get the input data
    data = mx.sym.Variable(name='data')
    body = get_stem(data=data, channels=channels, act_type=act_type)

    for idx in range(res_blocks):
        body = preact_residual_block(body, channels, name='res_block%d' % idx, kernel=3, act_type=act_type)

    body = mx.sym.BatchNorm(data=body, name='stem_bn1')
    body = get_act(data=body, act_type=act_type, name='stem_act1')

    # for policy output
    policy_out = policy_head(data=body, channels=channels, act_type=act_type,
                             channels_policy_head=channels_policy_head,
                             select_policy_from_plane=select_policy_from_plane, n_labels=n_labels,
                             grad_scale_policy=grad_scale_policy, use_se=False, no_bias=True)
    # for value output (pass value_kernelsize through instead of hardcoding 1, which left the parameter unused)
    value_out = value_head(data=body, channels_value_head=channels_value_head, value_kernelsize=value_kernelsize,
                           act_type=act_type, value_fc_size=value_fc_size, grad_scale_value=grad_scale_value,
                           use_se=False, use_mix_conv=False)
    # group value_out and policy_out together
    sym = mx.symbol.Group([value_out, policy_out])
    return sym
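def _demo_preact_resnet_outputs():
    # Hypothetical usage sketch (not part of the original file): build the symbol and
    # list the grouped outputs. The expected input layout (e.g. (batch, 34, 8, 8)
    # planes) is an assumption and depends on get_stem and the heads defined elsewhere.
    net = preact_resnet_symbol(channels=256, res_blocks=19)
    print(net.list_outputs())  # value output followed by policy output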
def rise_mobile_v3_symbol(channels=256, channels_operating_init=128, channel_expansion=64, act_type='relu',
                          channels_value_head=32, channels_policy_head=81, value_fc_size=128, dropout_rate=0.15,
                          select_policy_from_plane=True, use_se=True, res_blocks=13, n_labels=4992):
    """
    RISEv3 architecture
    :param channels: Main number of channels
    :param channels_operating_init: Initial number of channels at the start of the net for the depthwise convolution
    :param channel_expansion: Number of channels to add after each residual block
    :param act_type: Activation type to use
    :param channels_value_head: Number of channels for the value head
    :param channels_policy_head: Number of channels for the policy head
    :param value_fc_size: Number of units in the fully connected layer of the value head
    :param dropout_rate: Dropout factor to use. If 0, no dropout will be applied. Value must be in [0,1]
    :param select_policy_from_plane: True, if the plane policy head type shall be used
    :param use_se: Indicates if a squeeze excitation layer shall be used
    :param res_blocks: Number of residual blocks
    :param n_labels: Number of policy target labels (used for select_policy_from_plane=False)
    :return: symbol
    """
    # get the input data
    data = mx.sym.Variable(name='data')
    data = get_stem(data=data, channels=channels, act_type=act_type)

    cur_channels = channels_operating_init
    kernels = [
        [3],  # 0
        [3],  # 1
        [3, 5],  # 2
        [3, 5],  # 3
        [3, 5, 7, 9],  # 4
        [3, 5],  # 5
        [3, 5],  # 6
        [3, 5],  # 7
        [3, 5],  # 8
        [3, 5],  # 9
        [3, 5],  # 10
        [3, 5],  # 11
        [3, 5],  # 12
    ]

    for idx in range(res_blocks):
        cur_kernels = kernels[idx]
        # note: this overrides the use_se argument; SE is only active for block 4 and blocks 9 and later
        use_se = idx == 4 or idx >= 9
        data = preact_residual_dmixconv_block(data=data, channels=channels, channels_operating=cur_channels,
                                              kernels=cur_kernels, name='dconv_%d' % idx, use_se=use_se)
        cur_channels += channel_expansion

    data = mx.sym.BatchNorm(data=data, name='stem_bn1')
    data = get_act(data=data, act_type=act_type, name='stem_act1')

    if dropout_rate != 0:
        data = mx.sym.Dropout(data, p=dropout_rate)

    value_out = value_head(data=data, act_type=act_type, use_se=use_se, channels_value_head=channels_value_head,
                           value_fc_size=value_fc_size, use_mix_conv=True)
    policy_out = policy_head(data=data, act_type=act_type, channels_policy_head=channels_policy_head,
                             n_labels=n_labels, select_policy_from_plane=select_policy_from_plane, use_se=False,
                             channels=channels)
    # group value_out and policy_out together
    sym = mx.symbol.Group([value_out, policy_out])
    return sym
def preact_residual_dmixconv_block(data, channels, channels_operating, name, kernels=None, act_type='relu',
                                   se_ratio=4, se_type="se"):
    """
    Returns a pre-activation residual block with a depthwise mixed convolution and without any max pooling operation
    :param data: Input data
    :param channels: Number of filters for all CNN-layers
    :param channels_operating: Number of filters used for the depthwise mixed convolution
    :param name: Name for the residual block
    :param kernels: List of kernel sizes used for the mixed convolution
    :param act_type: Activation function to use
    :param se_ratio: Squeeze excitation ratio
    :param se_type: Squeeze excitation module type. Available [None, "se", "cbam", "ca_se", "cm_se", "sa_se", "sm_se"]
    :return: symbol
    """
    bn1 = mx.sym.BatchNorm(data=data, name=name + '_bn1')
    conv1 = mx.sym.Convolution(data=bn1, num_filter=channels_operating, kernel=(1, 1), pad=(0, 0), no_bias=True,
                               name=name + '_conv1')
    bn2 = mx.sym.BatchNorm(data=conv1, name=name + '_bn2')
    act1 = get_act(data=bn2, act_type=act_type, name=name + '_act1')
    conv2 = mix_conv(data=act1, channels=channels_operating, kernels=kernels, name=name + '_conv2')
    bn3 = mx.sym.BatchNorm(data=conv2, name=name + '_bn3')
    out = get_act(data=bn3, act_type=act_type, name=name + '_act2')
    out = mx.sym.Convolution(data=out, num_filter=channels, kernel=(1, 1), pad=(0, 0), no_bias=True,
                             name=name + '_conv3')
    if se_type is not None:
        if se_type == "se":
            out = channel_squeeze_excitation(out, channels, name=name + '_se', ratio=se_ratio, act_type=act_type,
                                             use_hard_sigmoid=True)
        elif se_type == "cbam":
            out = convolution_block_attention_module(out, channels, name=name + '_se', ratio=se_ratio,
                                                     act_type=act_type, use_hard_sigmoid=True)
        elif se_type == "ca_se":
            out = ca_se(out, channels, name=name + '_ca_se', ratio=se_ratio, act_type=act_type,
                        use_hard_sigmoid=True)
        elif se_type == "cm_se":
            out = cm_se(out, channels, name=name + '_cm_se', ratio=se_ratio, act_type=act_type,
                        use_hard_sigmoid=True)
        elif se_type == "sa_se":
            out = sa_se(out, name=name + '_sa_se', use_hard_sigmoid=True)
        elif se_type == "sm_se":
            out = sm_se(out, name=name + '_sm_se', use_hard_sigmoid=True)
        else:
            raise Exception(f'Unsupported se_type "{se_type}"')
    out_sum = mx.sym.broadcast_add(data, out, name=name + '_add')
    return out_sum
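def _demo_dmixconv_block_shapes():
    # Hypothetical sanity check (a sketch, not part of the original file): mix_conv is
    # assumed to split channels_operating across the given kernel sizes (as in MixConv,
    # Tan & Le 2019), while the surrounding 1x1 convolutions map back to `channels`,
    # so the block preserves the input shape for the residual add.
    data = mx.sym.Variable(name='data')
    block = preact_residual_dmixconv_block(data, channels=256, channels_operating=128, name='dmix_demo',
                                           kernels=[3, 5], se_type="ca_se")
    _, out_shapes, _ = block.infer_shape(data=(1, 256, 8, 8))
    assert out_shapes == [(1, 256, 8, 8)]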
def rise_mobile_v3_symbol(channels=256, channels_operating_init=128, channel_expansion=64, act_type='relu',
                          channels_value_head=8, channels_policy_head=81, value_fc_size=256, dropout_rate=0.15,
                          grad_scale_value=0.01, grad_scale_policy=0.99, select_policy_from_plane=True,
                          kernels=None, n_labels=4992, se_ratio=4, se_types="se"):
    """
    RISEv3 architecture
    :param channels: Main number of channels
    :param channels_operating_init: Initial number of channels at the start of the net for the depthwise convolution
    :param channel_expansion: Number of channels to add after each residual block
    :param act_type: Activation type to use
    :param channels_value_head: Number of channels for the value head
    :param channels_policy_head: Number of channels for the policy head
    :param value_fc_size: Number of units in the fully connected layer of the value head
    :param dropout_rate: Dropout factor to use. If 0, no dropout will be applied. Value must be in [0,1]
    :param grad_scale_value: Constant scalar with which the gradient for the value output is scaled
     (0.01 is recommended for supervised learning with little data)
    :param grad_scale_policy: Constant scalar with which the gradient for the policy output is scaled
    :param select_policy_from_plane: True, if the plane policy head type shall be used
    :param kernels: List of kernel sizes used for the residual blocks. The length of the list corresponds to the
     number of residual blocks.
    :param n_labels: Number of policy target labels (used for select_policy_from_plane=False)
    :param se_ratio: Reduction ratio used in the squeeze excitation module
    :param se_types: List of squeeze excitation modules to use for each residual layer.
     The length of this list must be the same as len(kernels). Available types:
    - "se": Squeeze excitation block - Hu et al. - https://arxiv.org/abs/1709.01507
    - "cbam": Convolutional Block Attention Module (CBAM) - Woo et al. - https://arxiv.org/pdf/1807.06521.pdf
    - "ca_se": Same as "se"
    - "cm_se": Squeeze excitation with max operator
    - "sa_se": Spatial excitation with average operator
    - "sm_se": Spatial excitation with max operator
    :return: symbol
    """
    # apply the default before the length check; len(None) would raise a TypeError
    if kernels is None:
        kernels = [[3]] * 13  # each entry is a list of kernel sizes for mix_conv
    if len(kernels) != len(se_types):
        raise Exception(f'The length of "kernels": {len(kernels)} must be the same as'
                        f' the length of "se_types": {len(se_types)}')
    valid_se_types = [None, "se", "cbam", "ca_se", "cm_se", "sa_se", "sm_se"]
    for se_type in se_types:
        if se_type not in valid_se_types:
            raise Exception(f'Unavailable se_type: {se_type}. Available se_types include {valid_se_types}')

    # get the input data
    data = mx.sym.Variable(name='data')
    data = get_stem(data=data, channels=channels, act_type=act_type)

    cur_channels = channels_operating_init
    for idx, cur_kernels in enumerate(kernels):
        data = preact_residual_dmixconv_block(data=data, channels=channels, channels_operating=cur_channels,
                                              kernels=cur_kernels, name='dconv_%d' % idx, se_ratio=se_ratio,
                                              se_type=se_types[idx])
        cur_channels += channel_expansion

    data = mx.sym.BatchNorm(data=data, name='stem_bn1')
    data = get_act(data=data, act_type=act_type, name='stem_act1')

    if dropout_rate != 0:
        data = mx.sym.Dropout(data, p=dropout_rate)

    value_out = value_head(data=data, act_type=act_type, use_se=False, channels_value_head=channels_value_head,
                           value_fc_size=value_fc_size, use_mix_conv=False, grad_scale_value=grad_scale_value)
    policy_out = policy_head(data=data, act_type=act_type, channels_policy_head=channels_policy_head,
                             n_labels=n_labels, select_policy_from_plane=select_policy_from_plane, use_se=False,
                             channels=channels, grad_scale_policy=grad_scale_policy)
    # group value_out and policy_out together
    sym = mx.symbol.Group([value_out, policy_out])
    return sym
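def _demo_rise_mobile_v3():
    # Hypothetical usage sketch (not part of the original file): kernels and se_types
    # must have the same length; each entry configures one residual block. The SE
    # placement below mirrors the older rise_mobile_v3_symbol variant above
    # (block 4 and blocks 9-12).
    kernels = [[3]] * 13
    kernels[4] = [3, 5, 7, 9]
    se_types = [None] * 13
    for idx in (4, 9, 10, 11, 12):
        se_types[idx] = "se"
    net = rise_mobile_v3_symbol(kernels=kernels, se_types=se_types)
    print(net.list_outputs())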