def test_noise_injection_with_checkpointing():
    from cntk import initializer
    shape = (100, 100)

    w1 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w2 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))
    w3 = parameter(shape=shape, init=initializer.glorot_uniform(seed=123))

    lr = learning_rate_schedule(0.5, UnitType.sample)
    m = C.momentum_schedule(0.99)

    learner1 = C.momentum_sgd([w1], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner2 = C.momentum_sgd([w2], lr, m, gaussian_noise_injection_std_dev=0.5)
    learner3 = C.momentum_sgd([w3], lr, m, gaussian_noise_injection_std_dev=0.5)

    assert np.allclose(w1.value, w2.value) and np.allclose(w1.value, w3.value)

    for i in range(10):
        checkpoint = learner1.create_checkpoint()

        v = np.float32(np.random.rand(100, 100))

        learner1.update({w1: v}, 1)
        learner2.update({w2: v}, 1)
        assert not np.allclose(w1.value, w2.value)

        learner3.restore_from_checkpoint(checkpoint)
        learner3.update({w3: v}, 1)
        assert np.allclose(w1.value, w3.value)
def __call__(self, num_classes=10, act_type=relu,
             mdl_conv1a_nf=40, mdl_conv1b_nf=60,
             mdl_conv2a_nf=50, mdl_conv2b_nf=75,
             mdl_fc1_nh=75,
             mdl_drop2a_p=0.033, mdl_drop2b_p=0.097, mdl_drop3_p=0.412,
             **kwargs):
    input_var = input_variable((1, self.img_h, self.img_w), np.float32)
    label_var = input_variable((self.n_dim), np.float32)

    conv1a = Convolution(filter_shape=(3, 3), num_filters=int(mdl_conv1a_nf), activation=act_type,
                         init=glorot_uniform(), pad=True, name='conv1a')(input_var)
    conv1b = Convolution(filter_shape=(3, 3), num_filters=int(mdl_conv1b_nf), activation=act_type,
                         init=glorot_uniform(), pad=True, name='conv1b')(conv1a)
    pool1 = MaxPooling(filter_shape=(2, 2), strides=(2, 2), name='pool1')(conv1b)

    conv2a = Convolution(filter_shape=(3, 3), num_filters=int(mdl_conv2a_nf), activation=act_type,
                         init=glorot_uniform(), pad=True, name='conv2a')(pool1)
    drop2a = Dropout(prob=mdl_drop2a_p, name="drop2a")(conv2a)
    conv2b = Convolution(filter_shape=(3, 3), num_filters=int(mdl_conv2b_nf), activation=act_type,
                         init=glorot_uniform(), pad=True, name='conv2b')(drop2a)
    # use the dedicated drop2b rate and name here (the original reused drop2a's by mistake)
    drop2b = Dropout(prob=mdl_drop2b_p, name="drop2b")(conv2b)
    pool2 = MaxPooling(filter_shape=(2, 2), strides=(2, 2), name='pool2')(drop2b)

    fc1 = Dense(shape=int(mdl_fc1_nh), init=glorot_uniform(), activation=act_type, name='fc1')(pool2)
    drop3 = Dropout(prob=mdl_drop3_p, name="drop3")(fc1)
    # fc2 = Dense(shape=num_classes, init=glorot_uniform(), activation=softmax, name='fc2')(drop3)
    fc2 = Dense(shape=num_classes, init=glorot_uniform(), activation=None, name='fc2')(drop3)

    return input_var, label_var, fc2
def create_basic_model(input, out_dims):
    convolutional_layer_1 = Convolution((5, 5), 16, init=glorot_uniform(),
                                        activation=relu, pad=True, strides=(1, 1))(input)
    pooling_layer_1 = MaxPooling((2, 2), strides=(1, 1))(convolutional_layer_1)

    convolutional_layer_2 = Convolution((5, 5), 16, init=glorot_uniform(),
                                        activation=relu, pad=True, strides=(1, 1))(pooling_layer_1)
    pooling_layer_2 = MaxPooling((3, 3), strides=(2, 2))(convolutional_layer_2)

    # this layer must stay active: pooling_layer_3 below consumes it
    convolutional_layer_3 = Convolution((9, 9), 16, init=glorot_uniform(),
                                        activation=relu, pad=True, strides=(1, 1))(pooling_layer_2)
    pooling_layer_3 = MaxPooling((3, 3), strides=(2, 2))(convolutional_layer_3)

    fully_connected_layer = Dense(256, init=glorot_uniform())(pooling_layer_3)
    dropout_layer = Dropout(0.5)(fully_connected_layer)

    output_layer = Dense(out_dims, init=glorot_uniform(), activation=None)(dropout_layer)

    return output_layer
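# Hypothetical usage sketch (not from the original source): wire create_basic_model
# up for CIFAR-10-sized data. The 3 x 32 x 32 input shape, 10 classes, and the
# variable names below are illustrative assumptions.
import numpy as np
import cntk as C

image_input = C.input_variable((3, 32, 32), np.float32)
label_input = C.input_variable(10, np.float32)

z = create_basic_model(image_input / 255.0, 10)       # scale pixels, build the network
loss = C.cross_entropy_with_softmax(z, label_input)   # training criterion
metric = C.classification_error(z, label_input)       # evaluation criterion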
def create_vgg9_model(input, num_classes):
    with default_options(activation=relu):
        model = Sequential([
            LayerStack(3, lambda i: [
                Convolution((3, 3), [64, 96, 128][i], init=glorot_uniform(), pad=True),
                Convolution((3, 3), [64, 96, 128][i], init=glorot_uniform(), pad=True),
                MaxPooling((3, 3), strides=(2, 2))
            ]),
            LayerStack(2, lambda: [
                Dense(1024, init=glorot_uniform())
            ]),
            Dense(num_classes, init=glorot_uniform(), activation=None)
        ])
    return model(input)
def frcn_predictor(features, rois, n_classes, model_path):
    # Load the pretrained classification net and find nodes
    loaded_model = load_model(model_path)
    feature_node = find_by_name(loaded_model, feature_node_name)
    conv_node    = find_by_name(loaded_model, last_conv_node_name)
    pool_node    = find_by_name(loaded_model, pool_node_name)
    last_node    = find_by_name(loaded_model, last_hidden_node_name)

    # Clone the conv layers and the fully connected layers of the network
    conv_layers = combine([conv_node.owner]).clone(CloneMethod.freeze, {feature_node: placeholder()})
    fc_layers   = combine([last_node.owner]).clone(CloneMethod.clone, {pool_node: placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - Constant(114)
    conv_out  = conv_layers(feat_norm)
    roi_out   = roipooling(conv_out, rois, (roi_dim, roi_dim))
    fc_out    = fc_layers(roi_out)

    # z = Dense(rois[0], num_classes, map_rank=1)(fc_out)  # --> map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b
    return z
def frcn_predictor(features, rois, n_classes, base_path):
    # model specific variables for AlexNet
    model_file = base_path + "/../../../resources/cntk/AlexNet.model"
    roi_dim = 6
    feature_node_name = "features"
    last_conv_node_name = "conv5.y"
    pool_node_name = "pool3"
    last_hidden_node_name = "h2_d"

    # Load the pretrained classification net and find nodes
    print("Loading pre-trained model...")
    loaded_model = load_model(model_file)
    print("Loading pre-trained model... DONE.")
    feature_node = find_by_name(loaded_model, feature_node_name)
    conv_node    = find_by_name(loaded_model, last_conv_node_name)
    pool_node    = find_by_name(loaded_model, pool_node_name)
    last_node    = find_by_name(loaded_model, last_hidden_node_name)

    # Clone the conv layers and the fully connected layers of the network
    conv_layers = combine([conv_node.owner]).clone(CloneMethod.freeze, {feature_node: placeholder()})
    fc_layers   = combine([last_node.owner]).clone(CloneMethod.clone, {pool_node: placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - constant(114)
    conv_out  = conv_layers(feat_norm)
    roi_out   = roipooling(conv_out, rois, (roi_dim, roi_dim))
    fc_out    = fc_layers(roi_out)
    # fc_out.set_name("fc_out")

    # z = Dense(rois[0], num_classes, map_rank=1)(fc_out)  # --> map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b
    return z, fc_out
def linear_layer(input_var, output_dim):
    times_param = parameter(shape=(list(input_var.shape) + [output_dim]), init=glorot_uniform())
    bias_param  = parameter(shape=(output_dim), init=0)

    t = times(input_var, times_param)
    return bias_param + t
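# Hypothetical usage sketch (not from the original source): project a 100-dimensional
# feature vector down to 10 outputs. The sizes and variable names are illustrative
# assumptions.
import numpy as np
import cntk as C

features = C.input_variable(100, np.float32)
z = linear_layer(features, 10)                      # times(features, W) + b
out = z.eval({features: np.random.rand(1, 100).astype(np.float32)})  # evaluate one sample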
def frcn_predictor(features, rois, n_classes, model_path):
    # Load the pretrained classification net and find nodes
    loaded_model = load_model(model_path)
    feature_node = find_by_name(loaded_model, feature_node_name)
    conv_node    = find_by_name(loaded_model, last_conv_node_name)
    pool_node    = find_by_name(loaded_model, pool_node_name)
    last_node    = find_by_name(loaded_model, last_hidden_node_name)

    # Clone the conv layers and the fully connected layers of the network
    conv_layers = combine([conv_node.owner]).clone(CloneMethod.freeze, {feature_node: placeholder()})
    fc_layers   = combine([last_node.owner]).clone(CloneMethod.clone, {pool_node: placeholder()})

    # Create the Fast R-CNN model
    feat_norm = features - Constant(114)
    conv_out  = conv_layers(feat_norm)
    roi_out   = roipooling(conv_out, rois, C.MAX_POOLING, (roi_dim, roi_dim), 0.0625)
    fc_out    = fc_layers(roi_out)

    # z = Dense(rois[0], num_classes, map_rank=1)(fc_out)  # --> map_rank=1 is not yet supported
    W = parameter(shape=(4096, n_classes), init=glorot_uniform())
    b = parameter(shape=n_classes, init=0)
    z = times(fc_out, W) + b
    return z
def LSTM(shape, cell_shape=None, activation=default_override_or(tanh),
         use_peepholes=default_override_or(False),
         init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
         enable_self_stabilization=default_override_or(False),
         name=''):
    activation                = get_default_override(LSTM, activation=activation)
    use_peepholes             = get_default_override(LSTM, use_peepholes=use_peepholes)
    init                      = get_default_override(LSTM, init=init)
    init_bias                 = get_default_override(LSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(LSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('LSTM', shape, cell_shape, activation=activation, use_peepholes=use_peepholes,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
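# Hypothetical usage sketch (not from the original source): the LSTM block above is
# meant to be wrapped in a recurrence over a sequence. The dimensions and names
# below are illustrative assumptions.
import cntk as C

seq_input = C.sequence.input_variable(80)       # a sequence of 80-dim vectors
lstm_layer = C.layers.Recurrence(LSTM(256))     # apply one LSTM step at every sequence position
h = lstm_layer(seq_input)                       # per-step hidden state of shape (256,)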
def GRU(shape, cell_shape=None, activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):  # (prev_h, x) -> h
    '''
    GRU(shape, cell_shape=None, activation=tanh, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create a GRU block for use inside a recurrence.

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to tanh): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        cntk.ops.functions.Function:
        A function ``(prev_h, input) -> h``
    '''
    activation = get_default_override(GRU, activation=activation)
    init       = get_default_override(GRU, init=init)
    init_bias  = get_default_override(GRU, init_bias=init_bias)
    enable_self_stabilization = get_default_override(GRU, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('GRU', shape, cell_shape, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def RNNUnit(shape, cell_shape=None, activation=default_override_or(sigmoid),
            init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False),
            name=''):
    '''
    RNNUnit(shape, cell_shape=None, activation=sigmoid, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    This is a deprecated name for :func:`~cntk.layers.blocks.RNNStep`. Use that name instead.
    '''
    activation                = get_default_override(RNNUnit, activation=activation)
    init                      = get_default_override(RNNUnit, init=init)
    init_bias                 = get_default_override(RNNUnit, init_bias=init_bias)
    enable_self_stabilization = get_default_override(RNNUnit, enable_self_stabilization=enable_self_stabilization)

    warnings.warn('This name will be removed in future versions. Please use '
                  'RNNStep(...) instead, which is identical except for its name', DeprecationWarning)

    return _RecurrentBlock('RNNStep', shape, cell_shape, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def attention_score(att_dim, init=glorot_uniform(), name=''):
    """
    Compute the attention score, where each of the inputs is projected to a new
    dimension space (att_dim) via Wi/Wm
    """
    sim = project_cosine(att_dim, init, name=name + '_sim')
    return sequence.softmax(10 * sim, name=name)
def resnet_classifer(input, num_classes):
    conv_w_scale = 7.07
    conv_b_value = 0

    fc1_w_scale = 0.4
    fc1_b_value = 0

    sc_value = 1
    bn_time_const = 4096

    kernel_width = 3
    kernel_height = 3

    conv1_w_scale = 0.26
    c_map1 = 16

    conv1 = conv_bn_relu_layer(input, c_map1, kernel_width, kernel_height, 1, 1,
                               conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_1 = resnet_node2(conv1, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_2 = resnet_node2(rn1_1, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn1_3 = resnet_node2(rn1_2, c_map1, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    c_map2 = 32
    rn2_1_wProj = get_projection_map(c_map2, c_map1)
    rn2_1 = resnet_node2_inc(rn1_3, c_map2, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const, rn2_1_wProj)
    rn2_2 = resnet_node2(rn2_1, c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn2_3 = resnet_node2(rn2_2, c_map2, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    c_map3 = 64
    rn3_1_wProj = get_projection_map(c_map3, c_map2)
    rn3_1 = resnet_node2_inc(rn2_3, c_map3, kernel_width, kernel_height,
                             conv1_w_scale, conv_b_value, sc_value, bn_time_const, rn3_1_wProj)
    rn3_2 = resnet_node2(rn3_1, c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)
    rn3_3 = resnet_node2(rn3_2, c_map3, kernel_width, kernel_height,
                         conv1_w_scale, conv_b_value, sc_value, bn_time_const)

    # Global average pooling
    poolw = 8
    poolh = 8
    poolh_stride = 1
    poolv_stride = 1

    pool = pooling(rn3_3, AVG_POOLING, (1, poolh, poolw), (1, poolv_stride, poolh_stride))
    out_times_params = parameter(shape=(c_map3, 1, 1, num_classes), init=glorot_uniform())
    out_bias_params  = parameter(shape=(num_classes), init=0)
    t = times(pool, out_times_params)
    return t + out_bias_params
def linear_layer(input_var, output_dim):
    input_dim = input_var.shape[0]

    times_param = parameter(shape=(input_dim, output_dim), init=glorot_uniform())
    bias_param  = parameter(shape=(output_dim), init=0)

    t = times(input_var, times_param)
    return bias_param + t
def WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=default_override_or(tanh),
                      use_peepholes=default_override_or(False),
                      init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
                      enable_self_stabilization=default_override_or(False),
                      seed=SentinelValueForAutoSelectRandomSeed,
                      name=''):
    '''
    WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=tanh, use_peepholes=False,
                      init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create an LSTM block for use inside a recurrence.
    The LSTM block implements one step of the recurrence and is stateless. It accepts the previous state as its first
    two arguments, and outputs its new state as a two-valued tuple ``(h,c)``.

    Example:
     >>> # a typical recurrent LSTM layer
     >>> from cntkx.layers import *
     >>> lstm_layer = Recurrence(WeightDroppedLSTM(500, 0.1))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        use_peepholes (bool, defaults to `False`):
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent LSTM layer.
    '''
    activation                = get_default_override(WeightDroppedLSTM, activation=activation)
    use_peepholes             = get_default_override(WeightDroppedLSTM, use_peepholes=use_peepholes)
    init                      = get_default_override(WeightDroppedLSTM, init=init)
    init_bias                 = get_default_override(WeightDroppedLSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(WeightDroppedLSTM,
                                                     enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('WeightDroppedLSTM', shape, cell_shape, activation=activation, use_peepholes=use_peepholes,
                           init=init, init_bias=init_bias, dropout_rate=dropout_rate, seed=seed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def linear_layer(input_var, output_dim):
    shape = input_var.shape()
    input_dim = shape[0]

    times_param = parameter(shape=(input_dim, output_dim), init=glorot_uniform())
    bias_param  = parameter(shape=(output_dim), init=0)

    t = times(input_var, times_param)
    return bias_param + t
def dot_attention(self, inputs, memory, dim):
    '''
    @inputs: [#,c][d] a sequence need attention
    @memory(key): [#,q][d] a sequence input refers to compute similarity(weight)
    @value: [#,q][d] a sequence input refers to weighted sum
    @output: [#,c][d] attention vector
    '''
    input_ph = C.placeholder()
    input_mem = C.placeholder()

    with C.layers.default_options(bias=False, activation=C.relu):  # all the projections have no bias
        attn_proj_enc = C.layers.Dense(dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(dim, init=glorot_uniform(), input_rank=1)

    inputs_ = attn_proj_enc(input_ph)   # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]

    unpack_memory, mem_mask = C.sequence.unpack(memory_, 0).outputs         # [#][*=q, d], [#][*=q]
    unpack_memory_expand = C.sequence.broadcast_as(unpack_memory, inputs_)  # [#,c][*=q,d]

    matrix = C.times_transpose(inputs_, unpack_memory_expand) / (dim ** 0.5)  # [#,c][*=q]
    mem_mask_expand = C.sequence.broadcast_as(mem_mask, inputs_)              # [#,c][*=q]
    matrix = C.element_select(mem_mask_expand, matrix, C.constant(-1e+30))    # [#,c][*=q]
    logits = C.reshape(C.softmax(matrix), (-1, 1))                            # [#,c][*=q,1]

    # [#,c][*=q, d]
    memory_expand = C.sequence.broadcast_as(C.sequence.unpack(input_mem, 0, no_mask_output=True), input_ph)
    weighted_att = C.reshape(C.reduce_sum(logits * memory_expand, axis=0), (-1,))  # [#,c][d]

    return C.as_block(C.combine(weighted_att, logits),
                      [(input_ph, inputs), (input_mem, memory)],
                      'dot attention', 'dot attention')
def LSTMCell(x, y, dh, dc):
    '''LightLSTM Cell'''

    b = C.parameter(shape=(4 * cell_dim), init=0)
    W = C.parameter(shape=(input_dim, 4 * cell_dim), init=glorot_uniform())
    H = C.parameter(shape=(cell_dim, 4 * cell_dim), init=glorot_uniform())

    # projected contribution from input x, hidden, and bias
    proj4 = b + C.times(x, W) + C.times(dh, H)

    it_proj  = C.slice(proj4, -1, 0 * cell_dim, 1 * cell_dim)
    bit_proj = C.slice(proj4, -1, 1 * cell_dim, 2 * cell_dim)
    ft_proj  = C.slice(proj4, -1, 2 * cell_dim, 3 * cell_dim)
    ot_proj  = C.slice(proj4, -1, 3 * cell_dim, 4 * cell_dim)

    it  = C.sigmoid(it_proj)        # input gate
    bit = it * C.tanh(bit_proj)

    ft  = C.sigmoid(ft_proj)        # forget gate
    bft = ft * dc

    ct = bft + bit

    ot = C.sigmoid(ot_proj)         # output gate
    ht = ot * C.tanh(ct)

    # projected contribution from input y, hidden, and bias
    proj4_2 = b + C.times(y, W) + C.times(ht, H)

    it_proj_2  = C.slice(proj4_2, -1, 0 * cell_dim, 1 * cell_dim)
    bit_proj_2 = C.slice(proj4_2, -1, 1 * cell_dim, 2 * cell_dim)
    ft_proj_2  = C.slice(proj4_2, -1, 2 * cell_dim, 3 * cell_dim)
    ot_proj_2  = C.slice(proj4_2, -1, 3 * cell_dim, 4 * cell_dim)

    it_2  = C.sigmoid(it_proj_2)    # input gate
    bit_2 = it_2 * C.tanh(bit_proj_2)

    ft_2  = C.sigmoid(ft_proj_2)    # forget gate
    bft_2 = ft_2 * ct

    ct2 = bft_2 + bit_2

    ot_2 = C.sigmoid(ot_proj_2)     # output gate
    ht2  = ot_2 * C.tanh(ct2)

    return (ht, ct, ht2, ct2)
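# Hypothetical usage sketch (not from the original source): LSTMCell reads the
# enclosing-scope names input_dim and cell_dim. The values, variable names, and
# wiring below are illustrative assumptions; in a full model the dh/dc placeholders
# would be bound to past values inside a recurrence.
import cntk as C

input_dim, cell_dim = 100, 256
x  = C.sequence.input_variable(input_dim)
y  = C.sequence.input_variable(input_dim)
dh = C.placeholder(shape=(cell_dim,))
dc = C.placeholder(shape=(cell_dim,))

ht, ct, ht2, ct2 = LSTMCell(x, y, dh, dc)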
def IndyLSTM(shape, activation=default_override_or(tanh),
             init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
             enable_self_stabilization=default_override_or(False),
             name=''):
    """
    Implementation of Independently Recurrent Long Short-term Memory cells (IndyLSTMs) by Gonnet and Deselaers.
    Paper can be found at https://arxiv.org/abs/1903.08023

    IndyLSTMs differ from regular LSTM cells in that the recurrent weights are not modeled as a full matrix,
    but as a diagonal matrix, i.e. the output and state of each LSTM cell depends on the inputs and its own
    output/state, as opposed to the input and the outputs/states of all the cells in the layer.
    The number of parameters per IndyLSTM layer, and thus the number of FLOPS per evaluation, is linear in the
    number of nodes in the layer, as opposed to quadratic for regular LSTM layers, resulting in potentially both
    smaller and faster models.

    Example:
     >>> # a gated recurrent layer
     >>> from cntkx.layers import *
     >>> indy_lstm_layer = Recurrence(IndyLSTM(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent IndyLSTM layer.
    """
    activation                = get_default_override(IndyLSTM, activation=activation)
    init                      = get_default_override(IndyLSTM, init=init)
    init_bias                 = get_default_override(IndyLSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(IndyLSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('IndyLSTM', shape, None, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias, dropout_rate=0, seed=SentinelValueForAutoSelectRandomSeed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def IndRNN(shape, activation=default_override_or(relu),
           init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
           enable_self_stabilization=default_override_or(False),
           name=''):
    """
    IndRNN implementation found in "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN"
    by Li, et al (https://arxiv.org/abs/1803.04831).

    IndRNNs are RNNs where neurons in each layer are independent from each other, and the cross-channel information
    is obtained through stacking multiple layers.

    It has been shown that an IndRNN can be easily regulated to prevent the gradient exploding and vanishing problems
    while allowing the network to learn long-term dependencies. Moreover, an IndRNN can work with non-saturated
    activation functions such as relu (rectified linear unit) and still be trained robustly.
    Multiple IndRNNs can be stacked to construct a network that is deeper than the existing RNNs.
    Experimental results have shown that the proposed IndRNN is able to process very long sequences
    (over 5000 time steps), can be used to construct very deep networks (21 layers used in the experiment)
    and still be trained robustly. Better performances have been achieved on various tasks by using IndRNNs
    compared with the traditional RNN and LSTM.

    IndRNN also enables the use of the relu activation, which is more efficient to compute than sigmoid and
    leads to faster convergence during training.

    You may consider initialising the recurrent weights using a uniform distribution from 0 to 1.

    The original code is available at: https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne.

    Example:
     >>> # a plain relu RNN layer
     >>> from cntkx.layers import *
     >>> relu_rnn_layer = Recurrence(IndRNN(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to relu): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` where ``h = activation(input @ W + prev_h * R + b)``
    """
    activation                = get_default_override(IndRNN, activation=activation)
    init                      = get_default_override(IndRNN, init=init)
    init_bias                 = get_default_override(IndRNN, init_bias=init_bias)
    enable_self_stabilization = get_default_override(IndRNN, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('IndRNN', shape, None, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias, dropout_rate=0, seed=SentinelValueForAutoSelectRandomSeed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def linear_layer(input_var, output_dim):
    try:
        shape = input_var.shape()
    except AttributeError:
        input_var = input_var.output()
        shape = input_var.shape()

    input_dim = shape[0]

    times_param = parameter(shape=(input_dim, output_dim), init=glorot_uniform())
    bias_param  = parameter(shape=(output_dim), init=0)

    t = times(input_var, times_param)
    return bias_param + t
def project_cosine_sim(att_dim, init=glorot_uniform(), name=''):
    """
    Compute the projected cosine similarity of two input sequences, where each of the inputs is
    projected to a new dimension space (att_dim) via Wi/Wm
    """
    Wi = Parameter(_INFERRED + tuple((att_dim,)), init=init, name='Wi')
    Wm = Parameter(_INFERRED + tuple((att_dim,)), init=init, name='Wm')

    status = placeholder_variable(name='status')
    memory = placeholder_variable(name='memory')

    projected_status = times(status, Wi, name='projected_status')
    projected_memory = times(memory, Wm, name='projected_memory')

    sim = cosine_similarity(projected_status, projected_memory, name=name + '_sim')
    return seq_softmax(sim, name=name)
def GRU(shape, cell_shape=None, activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):
    '''
    GRU(shape, cell_shape=None, activation=tanh, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create a GRU block for use inside a recurrence.
    The GRU block implements one step of the recurrence and is stateless. It accepts the previous state as its first argument,
    and outputs its new state.

    Example:
     >>> # a gated recurrent layer
     >>> from cntk.layers import *
     >>> gru_layer = Recurrence(GRU(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        cntk.ops.functions.Function:
        A function ``(prev_h, input) -> h`` that implements one step of a recurrent GRU layer.
    '''
    activation                = get_default_override(GRU, activation=activation)
    init                      = get_default_override(GRU, init=init)
    init_bias                 = get_default_override(GRU, init_bias=init_bias)
    enable_self_stabilization = get_default_override(GRU, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('GRU', shape, cell_shape, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def project_cosine(project_dim, init=glorot_uniform(), name=''):
    """
    Compute the projected cosine similarity of two input sequences, where each of the inputs is
    projected to a new dimension space (project_dim) via Wi/Wm
    """
    Wi = Parameter(_INFERRED + (project_dim,), init=init, name='Wi')
    Wm = Parameter(_INFERRED + (project_dim,), init=init, name='Wm')

    status = placeholder(name='status')
    memory = placeholder(name='memory')

    projected_status = times(status, Wi, name='projected_status')
    projected_memory = times(memory, Wm, name='projected_memory')

    status_br = sequence.broadcast_as(projected_status, projected_memory, name='status_broadcast')
    sim = cosine_distance(status_br, projected_memory, name=name)
    return sim
def GRU(shape, cell_shape=None, activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):  # (prev_h, x) -> h
    '''
    Layer factory function to create a GRU block for use inside a recurrence.
    Returns a function ``(prev_h, input) -> h``.
    '''
    activation = get_default_override(GRU, activation=activation)
    init       = get_default_override(GRU, init=init)
    init_bias  = get_default_override(GRU, init_bias=init_bias)
    enable_self_stabilization = get_default_override(GRU, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('GRU', shape, cell_shape, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def conv_bn_layer(input, out_feature_map_count, kernel_width, kernel_height, h_stride, v_stride,
                  w_scale, b_value, sc_value, bn_time_const):
    try:
        shape = input.shape()
    except AttributeError:
        input_var = input.output()
        shape = input_var.shape()
    num_in_channels = shape[0]

    # TODO: use RandomNormal to initialize, needs to be exposed in the python api
    conv_params = parameter(shape=(num_in_channels, kernel_height, kernel_width, out_feature_map_count),
                            init=glorot_uniform(output_rank=-1, filter_rank=2))
    conv_func = convolution(conv_params, input, (num_in_channels, v_stride, h_stride))

    # TODO: initialize using b_value and sc_value, needs to be exposed in the python api
    bias_params    = parameter(shape=(out_feature_map_count), init=b_value)
    scale_params   = parameter(shape=(out_feature_map_count), init=sc_value)
    running_mean   = constant((out_feature_map_count), 0.0)
    running_invstd = constant((out_feature_map_count), 0.0)

    return batch_normalization(conv_func, scale_params, bias_params, running_mean, running_invstd,
                               True, bn_time_const, 0.0, 0.000000001)
def RNNUnit(shape, cell_shape=None, activation=default_override_or(sigmoid),
            init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False),
            name=''):  # (prev_h, x) -> h
    '''
    Layer factory function to create a plain RNN block for use inside a recurrence.
    Returns a function ``(prev_h, input) -> h`` where ``h = activation(W * input + R * prev_h + b)``.
    '''
    activation = get_default_override(RNNUnit, activation=activation)
    init       = get_default_override(RNNUnit, init=init)
    init_bias  = get_default_override(RNNUnit, init_bias=init_bias)
    enable_self_stabilization = get_default_override(RNNUnit, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('RNNUnit', shape, cell_shape, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def LSTM(shape, cell_shape=None, activation=default_override_or(tanh),
         use_peepholes=default_override_or(False),
         init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
         enable_self_stabilization=default_override_or(False),
         name=''):
    '''
    Layer factory function to create an LSTM block for use inside a recurrence.
    Returns a function ``(prev_h, prev_c, input) -> h``.
    '''
    # resolve overrides against LSTM itself (the original mistakenly referenced RNNUnit here)
    activation    = get_default_override(LSTM, activation=activation)
    use_peepholes = get_default_override(LSTM, use_peepholes=use_peepholes)
    init          = get_default_override(LSTM, init=init)
    init_bias     = get_default_override(LSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(LSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('LSTM', shape, cell_shape, activation=activation, use_peepholes=use_peepholes,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def gru_cell(shape, init=glorot_uniform(), name=''):  # (x, (h,c))
    """
    GRU cell function
    """
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2  # patched dims with stack_axis duplicated 2 times

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')  # (h, c)

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'), name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'), name='r')
    h = tanh(times(x, Uh, name='x*Uh') + times(element_times(Sn_1, r, name='Sprev*r'), Wh), name='h')
    s = plus(element_times((1 - z), h, name='(1-z)*h'), element_times(z, Sn_1, name='z*SPrev'), name=name)

    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
def create_shallow_model(input, out_dims):
    convolutional_layer_1_1 = Convolution((7, 7), 32, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(input)
    convolutional_layer_1_2 = Convolution((25, 25), 32, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(convolutional_layer_1_1)
    pooling_layer_1 = MaxPooling((25, 25), strides=(5, 5))(convolutional_layer_1_2)

    convolutional_layer_2_1 = Convolution((3, 3), 32, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(pooling_layer_1)
    pooling_layer_2 = MaxPooling((2, 2), strides=(2, 2))(convolutional_layer_2_1)

    fully_connected_layer_1 = Dense(512, init=glorot_uniform())(pooling_layer_2)
    fully_connected_layer_2 = Dense(128, init=glorot_uniform())(fully_connected_layer_1)
    dropout_layer = Dropout(0.5)(fully_connected_layer_2)

    output_layer = Dense(out_dims, init=glorot_uniform(), activation=None)(dropout_layer)

    return output_layer
def IndRNNStep(shape, cell_shape=None, activation=default_override_or(relu),
               init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
               enable_self_stabilization=default_override_or(False),
               name=''):
    # resolve overrides against IndRNNStep itself (the original referenced RNNStep, likely a copy-paste slip)
    activation = get_default_override(IndRNNStep, activation=activation)
    init       = get_default_override(IndRNNStep, init=init)
    init_bias  = get_default_override(IndRNNStep, init_bias=init_bias)
    enable_self_stabilization = get_default_override(IndRNNStep, enable_self_stabilization=enable_self_stabilization)

    return IndRNNBlock('RNNStep', shape, cell_shape, activation=activation, use_peepholes=False,
                       init=init, init_bias=init_bias,
                       enable_self_stabilization=enable_self_stabilization, name=name)
def create_model_ext(input, ext_values, out_dims):
    # in VGG style
    # https://www.cs.toronto.edu/~frossard/post/vgg16/
    convolutional_layer_1_1 = Convolution((3, 3), 16, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(input)
    convolutional_layer_1_2 = Convolution((5, 5), 32, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(convolutional_layer_1_1)
    pooling_layer_1 = MaxPooling((2, 2), strides=(2, 2))(convolutional_layer_1_2)

    convolutional_layer_2_1 = Convolution((3, 3), 32, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(pooling_layer_1)
    convolutional_layer_2_2 = Convolution((7, 7), 64, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(convolutional_layer_2_1)
    pooling_layer_2 = MaxPooling((2, 2), strides=(1, 1))(convolutional_layer_2_2)

    convolutional_layer_3_1 = Convolution((3, 3), 64, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(pooling_layer_2)
    convolutional_layer_3_2 = Convolution((7, 7), 96, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(convolutional_layer_3_1)
    pooling_layer_3 = MaxPooling((2, 2), strides=(1, 1))(convolutional_layer_3_2)

    convolutional_layer_4_1 = Convolution((3, 3), 96, init=glorot_uniform(),
                                          activation=relu, pad=True, strides=(1, 1))(pooling_layer_3)
    pooling_layer_4 = MaxPooling((2, 2), strides=(1, 1))(convolutional_layer_4_1)

    ##
    fully_connected_layer_1 = Dense(512, init=glorot_uniform())(pooling_layer_4)
    dropout_layer_1 = Dropout(0.5)(fully_connected_layer_1)

    fully_connected_with_extra_values = splice(dropout_layer_1, ext_values, axis=0)
    fully_connected_layer_2 = Dense(256, init=glorot_uniform())(fully_connected_with_extra_values)
    fully_connected_layer_3 = Dense(128, init=glorot_uniform())(fully_connected_layer_2)
    dropout_layer_2 = Dropout(0.5)(fully_connected_layer_3)

    output_layer = Dense(out_dims, init=glorot_uniform(), activation=None)(dropout_layer_2)

    return output_layer
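# Hypothetical usage sketch (not from the original source): an image input plus a
# small vector of extra per-sample values spliced into the fully connected stage.
# The 3 x 64 x 64 shape, 4 extra values, and 10 classes are illustrative assumptions.
import numpy as np
import cntk as C

image_input = C.input_variable((3, 64, 64), np.float32)
extra_input = C.input_variable(4, np.float32)

z = create_model_ext(image_input / 255.0, extra_input, 10)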
def embedding(input, embedding_dim):
    input_dim = input.shape[0]

    embedding_parameters = parameter(shape=(input_dim, embedding_dim), init=glorot_uniform())
    return times(input, embedding_parameters)
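# Hypothetical usage sketch (not from the original source): embed one-hot word
# vectors into a dense space. The vocabulary size of 943 and embedding size of 150
# are illustrative assumptions.
import numpy as np
import cntk as C

words = C.sequence.input_variable(943, np.float32, is_sparse=True)
embedded = embedding(words, 150)    # each one-hot word becomes a 150-dim vector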
_trace_layers = False
#_trace_layers = True  # uncomment this to log creation of graph through layers

_INFERRED = (InferredDimension,)  # as a tuple, makes life easier

# call this for all untested branches
def UntestedBranchError(name):
    raise NotImplementedError("Untested code branch: " + name)

# This record contains the defaults for a number of optional parameters to layers.
# These can be overwritten temporarily by saying
#    with default_options(init=..., ...):
#        # code block within which the changed defaults are active
_current_default_options = Record(
    init=glorot_uniform(),
    activation=None,                   # Dense() and Convolution() have no activation by default
    pad=False,                         # BUGBUG: not done for pooling at present. Need a special default? How to name?
    # ^^ This should be addressed by allowing configs per layer type.
    #    To be addressed as a per-layer default. See default_options below.
    bias=True,
    init_bias=0,
    enable_self_stabilization=False,   # Stabilizer() and LSTM()
    initial_state=None,                # Recurrence()
    use_peepholes=False                # LSTM()
)

_default_sentinel           = '(default)'            # This is a singleton sentinel value we recognize and replace in _initializer_for()
_default_sentinel_init      = '(init default)'       # use different ones for init and init_bias so we can distinguish them in _initializer_for()
_default_sentinel_init_bias = '(init_bias default)'
# in function signatures we use symbols that indicate the default default in their name
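# Hypothetical usage sketch (not from the original source): the record above is what
# default_options temporarily overrides. The scale value, input size, and layer below
# are illustrative assumptions.
import cntk as C

with C.layers.default_options(init=C.glorot_uniform(scale=0.5), activation=C.relu):
    feats = C.input_variable(32)
    dense = C.layers.Dense(64)(feats)   # picks up the overridden init and activation defaults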
def LSTMP_cell_with_self_stabilization(input, prev_output, prev_cell_state):
    input_dim  = input.shape[0]
    output_dim = prev_output.shape[0]
    cell_dim   = prev_cell_state.shape[0]

    Wxo = parameter(shape=(input_dim, cell_dim), init=glorot_uniform())
    Wxi = parameter(shape=(input_dim, cell_dim), init=glorot_uniform())
    Wxf = parameter(shape=(input_dim, cell_dim), init=glorot_uniform())
    Wxc = parameter(shape=(input_dim, cell_dim), init=glorot_uniform())

    Bo = parameter(shape=(cell_dim), init=0)
    Bc = parameter(shape=(cell_dim), init=0)
    Bi = parameter(shape=(cell_dim), init=0)
    Bf = parameter(shape=(cell_dim), init=0)

    Whi = parameter(shape=(output_dim, cell_dim), init=glorot_uniform())
    Wci = parameter(shape=(cell_dim), init=glorot_uniform())

    Whf = parameter(shape=(output_dim, cell_dim), init=glorot_uniform())
    Wcf = parameter(shape=(cell_dim), init=glorot_uniform())

    Who = parameter(shape=(output_dim, cell_dim), init=glorot_uniform())
    Wco = parameter(shape=(cell_dim), init=glorot_uniform())

    Whc = parameter(shape=(output_dim, cell_dim), init=glorot_uniform())

    Wmr = parameter(shape=(cell_dim, output_dim), init=glorot_uniform())

    # Stabilization by routing input through an extra scalar parameter
    sWxo = parameter(init=0)
    sWxi = parameter(init=0)
    sWxf = parameter(init=0)
    sWxc = parameter(init=0)

    sWhi = parameter(init=0)
    sWci = parameter(init=0)

    sWhf = parameter(init=0)
    sWcf = parameter(init=0)
    sWho = parameter(init=0)
    sWco = parameter(init=0)
    sWhc = parameter(init=0)

    sWmr = parameter(init=0)

    expsWxo = exp(sWxo)
    expsWxi = exp(sWxi)
    expsWxf = exp(sWxf)
    expsWxc = exp(sWxc)

    expsWhi = exp(sWhi)
    expsWci = exp(sWci)

    expsWhf = exp(sWhf)
    expsWcf = exp(sWcf)
    expsWho = exp(sWho)
    expsWco = exp(sWco)
    expsWhc = exp(sWhc)

    expsWmr = exp(sWmr)

    Wxix  = times(element_times(expsWxi, input), Wxi)
    Whidh = times(element_times(expsWhi, prev_output), Whi)
    Wcidc = element_times(Wci, element_times(expsWci, prev_cell_state))

    it = sigmoid(Wxix + Bi + Whidh + Wcidc)

    Wxcx  = times(element_times(expsWxc, input), Wxc)
    Whcdh = times(element_times(expsWhc, prev_output), Whc)
    bit   = element_times(it, tanh(Wxcx + Whcdh + Bc))

    Wxfx  = times(element_times(expsWxf, input), Wxf)
    Whfdh = times(element_times(expsWhf, prev_output), Whf)
    Wcfdc = element_times(Wcf, element_times(expsWcf, prev_cell_state))

    ft = sigmoid(Wxfx + Bf + Whfdh + Wcfdc)

    bft = element_times(ft, prev_cell_state)

    ct = bft + bit

    Wxox  = times(element_times(expsWxo, input), Wxo)
    Whodh = times(element_times(expsWho, prev_output), Who)
    Wcoct = element_times(Wco, element_times(expsWco, ct))

    ot = sigmoid(Wxox + Bo + Whodh + Wcoct)

    mt = element_times(ot, tanh(ct))
    return (times(element_times(expsWmr, mt), Wmr), ct)
def conv_bn_layer(input, out_feature_map_count, kernel_width, kernel_height, h_stride, v_stride,
                  w_scale, b_value, sc_value, bn_time_const):
    shape = input.shape()
    num_in_channels = shape[0]

    # TODO: use RandomNormal to initialize, needs to be exposed in the python api
    conv_params = parameter(shape=(out_feature_map_count, num_in_channels, kernel_height, kernel_width),
                            init=glorot_uniform(output_rank=-1, filter_rank=2))
    conv_func = convolution(conv_params, input, (num_in_channels, v_stride, h_stride))

    # TODO: initialize using b_value and sc_value, needs to be exposed in the python api
    bias_params    = parameter(shape=(out_feature_map_count), init=b_value)
    scale_params   = parameter(shape=(out_feature_map_count), init=sc_value)
    running_mean   = constant((out_feature_map_count), 0.0)
    running_invstd = constant((out_feature_map_count), 0.0)

    return batch_normalization(conv_func, scale_params, bias_params, running_mean, running_invstd,
                               True, bn_time_const, 0.0, 0.000000001)