def attention(query, key, value):
    """Scaled dot-product attention of ``query`` over ``key`` / ``value``.

    Computes ``softmax(query @ key.T / sqrt(dk)) @ value`` where ``dk`` is the
    number of elements in one query step.

    ``obey_sequence_order`` and ``max_seq_len`` are free variables read from
    the enclosing scope (not visible in this block). When
    ``obey_sequence_order`` is truthy, scores on the upper-right of the
    score matrix are replaced by a large negative value before the softmax.

    Args:
        query: sequence tensor; the original annotations give it as
            ``[#, *] [q_dim]``.
        key: sequence tensor, ``[#, *] [key_dim]``; ``q_dim`` must equal
            ``key_dim``.
        value: sequence tensor, ``[#, *] [value_dim]``.

    Returns:
        The attended values, annotated by the original author as
        ``[#, *] [value_dim,]``.

    Raises:
        ValueError: if ``obey_sequence_order`` is truthy but ``max_seq_len``
            is not set.
    """
    # Scaled dot-product attention divides by sqrt(dk), not dk itself.
    # dk is computed in-graph from the query because sequence.last cannot be
    # used here (it would conflict with recurrence).
    # dk: [#, *] [1, ] and value = sqrt(dim_of_query)
    dk = C.sqrt(C.reduce_sum(C.ones_like(query)))

    unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)  # [#] [-3, key_dim]
    unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    scaled = C.times_transpose(query, broadcasted_key) / dk
    # scaled: [#, *] [-3, ] => for every key seq element, there is a corresponding score

    # Mask out invalid temporal connections so the sequence order is obeyed.
    if obey_sequence_order:
        if not max_seq_len:
            raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

        unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
        # unpacked_scaled: [#] [-3, -3]  <== matrix will be top right diagonally zero-ed
        # scaled_mask: [#] [-3,]

        minus_inf = C.constant(-1e+30)
        # Lower-triangular (including the diagonal) matrix of ones.
        valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
        valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)  # [#] [max_seq, max_seq]
        # Crop the fixed-size mask down to the actual score-matrix size.
        valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)  # [#] [-3, -3]
        unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
        scaled = C.to_sequence_like(unpacked_scaled, query)  # [#, *] [-3]

    attended = C.times(C.softmax(scaled, axis=-1),
                       C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
    return attended
def test_auto_broadcast_reconcile_issue():
    """Check the inputs of the node built by ``reconcile_dynamic_axes``.

    The resulting op must have exactly two inputs, 'y' then 'x', i.e. no
    extra auto-broadcast node was spliced in between.
    """
    seq_var = C.sequence.input((3, ), name='x')
    plain_var = C.input((3, ), name='y')

    reconciled = C.reconcile_dynamic_axes(plain_var, seq_var)
    op_inputs = reconciled.owner.inputs

    # Does the reconcile_dynamic_axes call trigger the auto broadcast?
    assert len(op_inputs) == 2
    first, second = op_inputs[0], op_inputs[1]
    assert first.name == 'y' and second.name == 'x'
def test_auto_broadcast_reconcile_issue():
    """Verify ``reconcile_dynamic_axes`` wires its operands in directly.

    The owner op of the result is expected to take exactly the two original
    variables as inputs, in the order ('y', 'x').
    """
    seq_operand = C.sequence.input((3,), name='x')
    static_operand = C.input((3,), name='y')

    owner_inputs = C.reconcile_dynamic_axes(static_operand, seq_operand).owner.inputs

    # Does the reconcile_dynamic_axes call trigger the auto broadcast?
    assert len(owner_inputs) == 2
    assert owner_inputs[0].name == 'y' and owner_inputs[1].name == 'x'
def broadcast_xy(input_vec, h, w):
    """Broadcast an input vector of length d to a tensor of shape (d, h, w).

    Args:
        input_vec: CNTK node whose first static dimension is ``d``.
        h: target height, must be positive.
        w: target width, must be positive.

    Returns:
        A node of shape ``(d, h, w)`` obtained by adding the reshaped vector
        to a zero constant that carries the dynamic axes of ``input_vec``.

    Raises:
        ValueError: if ``h`` or ``w`` is not positive.
    """
    # Explicit check instead of ``assert`` so validation survives ``python -O``.
    if h <= 0 or w <= 0:
        raise ValueError(
            "h and w must be positive, got h={} and w={}".format(h, w))

    d = input_vec.shape[0]

    # Reshape the vector to d x 1 x 1 so it broadcasts over h and w.
    x = C.reshape(input_vec, (d, 1, 1))

    # Zero tensor of the target size (d x h x w).
    t = np.zeros((d, h, w), dtype=np.float32)
    y = C.constant(t)

    # Give the constant the same dynamic axes as x before adding.
    z = C.reconcile_dynamic_axes(y, x)
    z = z + x

    return z
def test_to_sequence_backprop(device_id):
    """Compare loss and gradients of a sequence model with a ``to_sequence`` clone.

    A small Embedding -> LSTM -> Dense model is evaluated on sparse sequence
    input. The same model is then cloned so that its input comes from a
    non-sequence tensor converted with ``C.to_sequence``. Both must produce
    identical losses and identical (dense) parameter gradients.
    """
    dev = cntk_device(device_id)

    input_vocab_size = 3
    emb_dim = 2
    hidden_dim = 2
    num_labels = 2

    # Reference model, fed with a native sequence input.
    x_seq_input = C.sequence.input_variable(input_vocab_size, is_sparse=True, name='features')
    with C.default_options(initial_state=0.1):
        model = C.layers.Embedding(emb_dim, name='embed')(x_seq_input)
        model = C.layers.Recurrence(C.layers.LSTM(hidden_dim), go_backwards=False)(model)
        model = C.layers.Dense(num_labels, name='classify')(model)

    z = model
    label_seq_input = C.sequence.input_variable(num_labels, is_sparse=True, name='labels')
    ce = C.cross_entropy_with_softmax(z, label_seq_input)

    seq1_data = [[0, 1, 1], [0, 1, 0], [1, 0, 0]]
    seq2_data = [[0, 0, 1], [0, 1, 1]]
    seq1_label_data = [[0, 1], [0, 1], [1, 0]]
    seq2_label_data = [[1, 0], [0, 1]]
    label_seq_data = [_to_csr(seq1_label_data), _to_csr(seq2_label_data)]

    reference_feed = {
        x_seq_input: [_to_csr(seq1_data), _to_csr(seq2_data)],
        label_seq_input: label_seq_data,
    }
    param_grads_1, loss_result_1 = ce.grad(reference_feed, wrt=ce.parameters,
                                           outputs=[ce], as_numpy=False)

    # Create a clone of the model that uses a non-sequence input
    # and converts it to a sequence using to_sequence
    x_non_seq_input = C.input_variable((C.FreeDimension, input_vocab_size),
                                       is_sparse=True, name='non_seq_features')
    x_seq_lens = C.input_variable((), name='sequence_lengths')
    x_seq = C.to_sequence(x_non_seq_input, x_seq_lens)
    identity = np.eye(input_vocab_size, dtype=np.float32)
    x_seq = C.reconcile_dynamic_axes(C.times(x_seq, identity), label_seq_input)
    ce_clone = ce.clone('share', {x_seq_input: x_seq})

    # The second sequence is padded with an all-zero row to the common length 3.
    x_non_seq_data = C.NDArrayView.from_csr(_to_csr([seq1_data, seq2_data + [[0, 0, 0]]]),
                                            shape=(2, 3, 3))
    x_seq_lens_data = np.asarray([3, 2], dtype=np.float32)

    def clone_argument(name):
        # First argument of the cloned function carrying the given name.
        return next(argument for argument in ce_clone.arguments if argument.name == name)

    x_non_seq_input = clone_argument('non_seq_features')
    label_seq_input = clone_argument('labels')
    x_seq_lens = clone_argument('sequence_lengths')

    clone_feed = {
        x_non_seq_input: x_non_seq_data,
        x_seq_lens: x_seq_lens_data,
        label_seq_input: label_seq_data,
    }
    param_grads_2, loss_result_2 = ce_clone.grad(clone_feed, wrt=ce_clone.parameters,
                                                 outputs=[ce_clone], as_numpy=False)

    # Losses must match for both sequences of the minibatch.
    for seq_index in (0, 1):
        assert np.array_equal(loss_result_1.as_sequences()[seq_index],
                              loss_result_2.as_sequences()[seq_index])

    # Only dense gradients are compared.
    for param in param_grads_1:
        if param_grads_1[param].is_sparse:
            continue
        reference_grad_value = param_grads_1[param].asarray()
        grad_value = param_grads_2[param].asarray()
        assert np.array_equal(reference_grad_value, grad_value)
def create_train_model(s2smodel, embed_layer):
    '''
    Build the training graph for the sequence-to-sequence model.

    Four dense sequence inputs are created: 'qwk'/'qwn' on a dynamic axis
    'q' and 'awk'/'awn' on a dynamic axis 'a'. Their static dimensions come
    from ``myConfig['wg_dim']`` and ``myConfig['wn_dim']``.

    Args:
        s2smodel: callable applied as ``s2smodel(a_processed, q_processed)``
            producing the logits.
        embed_layer: callable applied to each (wk, wn) input pair.

    return:
        @input map: dict from input name to input variable
        @logits: un-normalised model outputs aligned with the labels
            (no softmax is applied here)
        @loss: ``C.combine(loss, errs)`` with the cross entropy and the
            classification error
    '''
    q = C.Axis.new_unique_dynamic_axis('q')
    a = C.Axis.new_unique_dynamic_axis('a')

    qwk = C.sequence.input_variable(myConfig['wg_dim'], sequence_axis=q, is_sparse=False, name='qwk')
    qwn = C.sequence.input_variable(myConfig['wn_dim'], sequence_axis=q, is_sparse=False, name='qwn')
    awk = C.sequence.input_variable(myConfig['wg_dim'], sequence_axis=a, is_sparse=False, name='awk')
    awn = C.sequence.input_variable(myConfig['wn_dim'], sequence_axis=a, is_sparse=False, name='awn')

    input_ph = {'qwk': qwk, 'qwn': qwn, 'awk': awk, 'awn': awn}

    a_processed = embed_layer(awk, awn)
    q_processed = embed_layer(qwk, qwn)

    a_onehot = C.splice(awk, awn)
    # The label matches the variable actually printed (was "q_onehot").
    print("a_onehot shape:{}".format(a_onehot.output))

    # query generate answer
    logits = s2smodel(a_processed, q_processed)
    # Drop the last step of the logits and the first step of the labels so
    # that each position is scored against the following token.
    logits = C.sequence.slice(logits, 0, -1)
    print('logits shape:{}'.format(logits.output))
    labels = C.sequence.slice(a_onehot, 1, 0)  # <s> a b c </s> -> a b c </s>
    print('labels shape:{}'.format(labels.output))

    # Both slices have the same length; make the dynamic axes agree.
    logits = C.reconcile_dynamic_axes(logits, labels)
    loss = C.cross_entropy_with_softmax(logits, labels)
    errs = C.classification_error(logits, labels)

    return input_ph, logits, C.combine(loss, errs)
def inner(a):
    """Build, from ``a``, a sequence of ``sequence.where`` outputs with a trailing axis.

    A constant 1 is broadcast along the sequence of ``a``, passed through
    ``C.sequence.where``, reconciled with the dynamic axes of ``a`` and
    finally expanded with one extra static axis at the end.
    """
    ones_along_a = C.sequence.broadcast_as(1, a)
    positions = C.sequence.where(ones_along_a)
    # reconcile_dynamic_axes is necessary to avoid subtle bugs e.g. sequence.where and one_hot
    positions = C.reconcile_dynamic_axes(positions, a)
    return C.expand_dims(positions, axis=-1)
def hierarchical_softmax_layer_for_sequence(input_var, num_output_classes, target_class, target_output_in_class, batch_size, w1, b1, w2s, b2s):
    '''
    A two layers hierarchical softmax function with sequence axis input:

    Example:
        >>> input_dim = 2
        >>> num_output_classes = 4
        >>> minibatch_size = 3
        >>> seq_size = 5
        >>> n_classes = int(math.ceil(math.sqrt(num_output_classes)))
        >>> n_outputs_per_class = n_classes

        >>> w1 = C.parameter(shape=(input_dim, n_classes), init=C.glorot_normal(seed=2), name='w1')
        >>> b1 = C.parameter(shape=(n_classes), init=C.glorot_normal(seed=3), name='b1')
        >>> w2s = C.parameter(shape=(n_classes, input_dim, n_outputs_per_class), init=C.glorot_normal(seed=4), name='w2s')
        >>> b2s = C.parameter(shape=(n_classes, n_outputs_per_class), init=C.glorot_normal(seed=5), name='b2s')

        # neural network structure for hierarchical softmax

        >>> h_input = C.sequence.input_variable(input_dim)
        >>> h_target_class = C.sequence.input_variable([1])
        >>> h_target_output_in_class = C.sequence.input_variable([1])
        >>> h_z, class_probs, all_probs = hierarchical_softmax_layer_for_sequence(h_input, num_output_classes, h_target_class, h_target_output_in_class, minibatch_size, w1, b1, w2s, b2s)

        >>> a = np.reshape(np.arange(seq_size * minibatch_size * input_dim, dtype = np.float32), (seq_size, minibatch_size, input_dim))
        >>> labels = np.reshape(np.arange(seq_size * minibatch_size, dtype = np.float32), (seq_size, minibatch_size, 1)) % num_output_classes
        >>> target_labels = labels // n_outputs_per_class
        >>> target_output_in_labels = labels % n_outputs_per_class
        >>> h_z.eval({h_input: a, h_target_class: target_labels, h_target_output_in_class: target_output_in_labels})[1]
        array([[ 0.000859],
               [ 0.      ],
               [ 0.      ]], dtype=float32)

    Args:
        input_var: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        num_output_classes: int
        target_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        target_output_in_class: class:`~cntk.ops.functions.Function` that outputs a tensor with sequence axis and batch axis
        batch_size: int (not used by the body)
        w1: C.parameter
        b1: C.parameter
        w2s: C.parameter
        b2s: C.parameter

    Returns:
        output_prob: class:`~cntk.ops.functions.Function`
        class_probs: class:`~cntk.ops.functions.Function`
        all_probs: a list of class:`~cntk.ops.functions.Function`
    '''
    input_dim = input_var.shape[0]

    # The classes form a square grid: ceil(sqrt(N)) classes, each holding
    # the same number of outputs.
    n_classes = int(math.ceil(math.sqrt(num_output_classes)))
    n_outputs_per_class = n_classes
    class_weight_shape = (input_dim, n_outputs_per_class)

    # First level: distribution over the classes.
    class_probs = C.softmax(b1 + C.times(input_var, w1))

    # Second level: pick the weights and bias belonging to the target class.
    w2_temp = C.gather(w2s, target_class)
    w2 = reshape(w2_temp, class_weight_shape)
    w2 = C.sequence.broadcast_as(w2, input_var)
    b2 = reshape(C.gather(b2s, target_class), n_outputs_per_class)
    b2 = C.sequence.broadcast_as(b2, input_var)

    times_result = times(input_var, w2)
    probs_in_class = softmax(b2 + times_result)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, target_output_in_class)

    # Select the probability of the target output through a one-hot product.
    output_one_hot = C.one_hot(target_output_in_class, n_outputs_per_class, False)
    probs_in_class = C.sequence.broadcast_as(probs_in_class, output_one_hot)
    prob_in_class = C.times_transpose(probs_in_class, output_one_hot)

    # Same selection for the probability of the target class.
    class_one_hot = C.one_hot(target_class, n_classes, False)
    class_probs = C.sequence.broadcast_as(class_probs, class_one_hot)
    class_prob = C.times_transpose(class_probs, class_one_hot)

    output_prob = C.element_times(class_prob, prob_in_class)

    # this is for calculating all the outputs' probabilities
    all_probs = []
    for class_index in range(n_classes):
        class_id = C.constant(class_index)

        class_w = C.reshape(C.gather(w2s, class_id), class_weight_shape)
        class_w = C.sequence.broadcast_as(class_w, input_var)
        class_b = C.reshape(C.gather(b2s, class_id), n_outputs_per_class)
        class_b = C.sequence.broadcast_as(class_b, input_var)
        within_class_probs = C.softmax(class_b + times(input_var, class_w))

        # One-hot selector for this class, on the dynamic axes of class_probs.
        selector = C.constant(class_index, shape=[1])
        selector = C.reconcile_dynamic_axes(selector, class_probs)
        selector = C.one_hot(selector, n_outputs_per_class, False)

        this_class_prob = C.times_transpose(class_probs, selector)
        this_class_prob = C.sequence.broadcast_as(this_class_prob, within_class_probs)

        all_probs.append(C.element_times(this_class_prob, within_class_probs))

    return output_prob, class_probs, all_probs
def inner(a):
    """Return ``sequence.where`` of an all-ones sequence, aligned with ``a``.

    ``a`` is first passed through ``Cx.scalar``; an all-ones tensor of that
    shape is fed to ``C.sequence.where`` and the result is reconciled with
    the dynamic axes of ``a``.
    """
    all_ones = C.ones_like(Cx.scalar(a))
    positions = C.sequence.where(all_ones)
    # reconcile_dynamic_axes is necessary to avoid subtle bugs e.g. sequence.where and one_hot
    return C.reconcile_dynamic_axes(positions, a)
def call(self, x, mask=None):
    """Return ``self.prior_boxes`` as a backend tensor matched to the batch of ``x``.

    Args:
        x: input tensor of the layer; only its batch size (TensorFlow) or
            its dynamic axes (CNTK) are used.
        mask: unused.

    Returns:
        * TensorFlow backend: the prior boxes with a leading axis, tiled
          ``tf.shape(x)[0]`` times along that axis.
        * CNTK backend: a constant holding the prior boxes, reconciled with
          the dynamic axes of ``x``.
        * any other backend (Theano is still a TODO): the prior boxes with
          a leading axis of size 1, not tiled.
    """
    # NOTE(review): self.prior_boxes is assumed to be precomputed elsewhere
    # (e.g. in the constructor) - confirm against the rest of the class.

    # Convert to a backend tensor and add one leading dimension; the
    # original author notes a resulting shape such as (1, 54, 8).
    prior_boxes_tensor = K.expand_dims(K.variable(self.prior_boxes), 0)

    backend_name = K.backend()

    if backend_name == 'tensorflow':
        # Tile pattern (batch, 1, 1); tf.shape(x)[0] is the batch size.
        pattern = [tf.shape(x)[0], 1, 1]
        # Result is tiled once per batch element (batch dimension is None
        # in the static shape).
        return K.tile(prior_boxes_tensor, pattern)

    if backend_name == 'cntk':
        prior_boxes_constants = C.Constant(self.prior_boxes)
        # Attach the dynamic (batch) axes of x to the constant.
        prior_boxes_constants2 = C.reconcile_dynamic_axes(
            prior_boxes_constants, dynamic_axes_as=x)
        return prior_boxes_constants2

    # TODO: the 'theano' backend is not handled yet; the un-tiled tensor is
    # returned for it and for any other backend.
    return prior_boxes_tensor
def attention_layer(self, context, query, dim):
    """Build a two-level attention block between ``context`` and ``query``.

    Both inputs go through bias-free ReLU ``Dense`` projections of size
    ``self.hidden_dim``. A score matrix between the projected sequences is
    used twice: softmax-normalised over the context positions
    ('level 1 weight') and, after transposition, over the query positions
    ('level 2 weight'). The attention graph is written once on placeholders
    and then cloned with concrete inputs.

    The shape annotations in the comments are the original author's:
    ``#`` is the batch axis, ``c`` / ``q`` the context / query sequence
    axes and ``d`` the hidden dimension.

    Args:
        context: node bound to the context placeholder of shape ``(dim,)``.
        query: node bound to the query placeholder of shape ``(dim,)``.
        dim: static dimension of both inputs.

    Returns:
        A CNTK block named 'attention_layer' that splices the raw context
        input, the query-aware context and the context self-attention.
    """
    input_ph = C.placeholder(shape=(dim, ))
    input_mem = C.placeholder(shape=(dim, ))

    with C.layers.default_options(bias=False, activation=C.relu):
        attn_proj_enc = C.layers.Dense(
            self.hidden_dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(
            self.hidden_dim, init=glorot_uniform(), input_rank=1)

    inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]

    # Placeholders the attention graph is written against; they are bound
    # to real nodes by the clone() calls further down.
    cln_mem_ph = C.placeholder()  # [#,q][?=d]
    cln_inp_ph = C.placeholder()  # [#,c][?=d]

    scale = self.hidden_dim**0.5
    masked_out = -1e30

    # ---- level 1: every query step attends over the context ----
    unpack_inputs, inputs_mask = C.sequence.unpack(cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
    expand_inputs = C.sequence.broadcast_as(unpack_inputs, cln_mem_ph)  # [#,q][*=c,d]

    scores = C.times_transpose(cln_mem_ph, expand_inputs) / scale
    matrix = C.reshape(scores, (-1, ))  # [#,q][*=c]
    # Padded context positions get a large negative score.
    matrix = C.element_select(
        C.sequence.broadcast_as(inputs_mask, cln_mem_ph),
        matrix,
        C.constant(masked_out))
    logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]

    trans_expand_inputs = C.transpose(expand_inputs, [1, 0])  # [#,q][d,*=c]
    weighted_inputs = C.reduce_sum(logits * trans_expand_inputs, axis=1)
    q_over_c = C.reshape(weighted_inputs, (-1, )) / scale  # [#,q][d]
    new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]

    # ---- level 2: every context step attends over the query ----
    unpack_matrix, matrix_mask = C.sequence.unpack(matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
    inputs_mask_s = C.to_sequence(C.reshape(inputs_mask, (-1, 1)))  # [#,c'][1]
    trans_matrix = C.to_sequence_like(
        C.transpose(unpack_matrix, [1, 0]), inputs_mask_s)  # [#,c'][*=q]
    # Keep only the steps whose mask entry is set.
    trans_matrix = C.sequence.gather(trans_matrix, inputs_mask_s)  # [#,c2][*=q]
    trans_matrix = C.element_select(
        C.sequence.broadcast_as(matrix_mask, trans_matrix),
        trans_matrix,
        C.constant(masked_out))
    logits2 = C.softmax(trans_matrix, axis=0, name='level 2 weight')  # [#,c2][*=c]

    unpack_new_q, _new_q_mask = C.sequence.unpack(new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
    expand_new_q = C.transpose(
        C.sequence.broadcast_as(unpack_new_q, trans_matrix), [1, 0])  # [#,c2][2d,*=q]
    weighted_new_q = C.reduce_sum(logits2 * expand_new_q, axis=1)
    c_over_q = C.reshape(weighted_new_q, (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
    c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

    # Query-aware context: memory = projected query, input = projected context.
    weighted_q = c_over_q.clone(
        C.CloneMethod.share,
        {cln_mem_ph: memory_, cln_inp_ph: inputs_})  # [#,c][2d]
    # Context self-attention: the projected context plays both roles.
    c2c = q_over_c.clone(
        C.CloneMethod.share,
        {cln_mem_ph: inputs_, cln_inp_ph: inputs_})  # [#,c][2d]

    att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

    return C.as_block(
        att_context,
        [(input_ph, context), (input_mem, query)],
        'attention_layer',
        'attention_layer')