def test_calc_gradient(self):
    x = layers.create_parameter(dtype="float32", shape=[5, 10])
    y = layers.create_parameter(dtype="float32", shape=[10, 8])
    mul_out = layers.mul(x=x, y=y)
    mean_out = layers.mean(mul_out)
    a = calc_gradient(mean_out, mul_out)
    b = calc_gradient(mean_out, x)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
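# A hedged numpy cross-check of the test above (illustrative, not part of the
# original suite): for mean(mul_out) with mul_out of shape [5, 8], the gradient
# w.r.t. mul_out is the constant 1/40, and the gradient w.r.t. x follows by the
# chain rule through the matmul. All names below are local to this sketch.
import numpy as np

x_np = np.random.rand(5, 10).astype("float32")
y_np = np.random.rand(10, 8).astype("float32")

grad_mul_out = np.full((5, 8), 1.0 / 40, dtype="float32")  # d mean / d mul_out
grad_x = grad_mul_out.dot(y_np.T)                          # d mean / d x
assert grad_x.shape == (5, 10)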
def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train'):
    softmax_weight = layers.create_parameter(
        [self.hidden_size, self.tar_vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))
    if mode == 'train':
        dec_output, dec_last_hidden, dec_last_cell = basic_lstm(
            self.tar_emb,
            enc_last_hidden,
            enc_last_cell,
            self.hidden_size,
            num_layers=self.num_layers,
            batch_first=self.batch_first,
            dropout_prob=self.dropout,
            param_attr=ParamAttr(
                initializer=fluid.initializer.UniformInitializer(
                    low=-self.init_scale, high=self.init_scale)),
            bias_attr=ParamAttr(
                initializer=fluid.initializer.Constant(0.0)))
        dec_output = layers.matmul(dec_output, softmax_weight)
        return dec_output
    else:
        raise ValueError("mode not supported: %s" % mode)
def fully_con_layer(self, x, n, channel, name):
    """1x1 convolution acting as a per-position fully connected layer,
    plus a learnable bias."""
    bt_init = fluid.initializer.TruncatedNormal(loc=0.0, scale=2.0)
    bt = fl.create_parameter(
        shape=[n, 1],
        dtype="float32",
        attr=fluid.ParamAttr(
            name="%s_bt" % name, trainable=True, initializer=bt_init),
    )
    x_conv = fl.conv2d(
        input=x,
        num_filters=1,
        filter_size=[1, 1],
        stride=[1, 1],
        padding="SAME",
        data_format="NHWC",
        param_attr=fluid.ParamAttr(name="%s_conv2d" % name))
    x_conv = x_conv + bt
    return x_conv
def _calc_bow_logits(self, enc_out, bow_pos):
    """Get the bag-of-words logits for generation."""
    bow_feat = layers.slice(input=enc_out, axes=[1], starts=[0], ends=[1])
    bow_feat = layers.reshape(x=bow_feat, shape=[-1, self.hidden_size])
    bow_pos = layers.cast(x=bow_pos, dtype="int32")
    bow_feat = layers.gather(input=bow_feat, index=bow_pos)

    bow_trans_feat = layers.fc(
        input=bow_feat,
        size=self.emb_size,
        act=self.hidden_act,
        param_attr=fluid.ParamAttr(
            name="bow_trans_fc.w_0", initializer=self.param_initializer),
        bias_attr=fluid.ParamAttr(name="bow_trans_fc.b_0"))
    bow_trans_feat = pre_process_layer(
        bow_trans_feat, self.post_cls_cmd, name="bow_trans")

    if self.weight_sharing:
        fc_out = layers.matmul(
            x=bow_trans_feat,
            y=fluid.default_main_program().global_block().var(
                self.token_emb_name),
            transpose_y=True)
        if self.cls_bias:
            fc_out += layers.create_parameter(
                shape=[self.vocab_size],
                dtype=self.dtype,
                attr=fluid.ParamAttr(name="bow_out_fc.b_0"),
                is_bias=True)
    else:
        bow_out_bias_attr = fluid.ParamAttr(
            name="bow_out_fc.b_0") if self.cls_bias else False
        fc_out = layers.fc(
            input=bow_trans_feat,
            size=self.vocab_size,
            param_attr=fluid.ParamAttr(
                name="bow_out_fc.w_0", initializer=self.param_initializer),
            bias_attr=bow_out_bias_attr)
    return fc_out
def _calc_logits(self, enc_out, checkpoints=None, seq_pos=None):
    """Get the logits of generation."""
    enc_out = layers.reshape(x=enc_out, shape=[-1, self.hidden_size])
    if seq_pos is not None:
        seq_pos = layers.cast(x=seq_pos, dtype="int32")
        seq_feat = layers.gather(input=enc_out, index=seq_pos)
    else:
        seq_feat = enc_out

    seq_trans_feat = layers.fc(
        input=seq_feat,
        size=self.emb_size,
        act=self.hidden_act,
        param_attr=fluid.ParamAttr(
            name="mask_lm_trans_fc.w_0", initializer=self.param_initializer),
        bias_attr=fluid.ParamAttr(name="mask_lm_trans_fc.b_0"))
    seq_trans_feat = pre_process_layer(
        seq_trans_feat, self.post_cls_cmd, name="mask_lm_trans")
    if checkpoints is not None:
        checkpoints.append(seq_trans_feat)

    if self.weight_sharing:
        fc_out = layers.matmul(
            x=seq_trans_feat,
            y=fluid.default_main_program().global_block().var(
                self.token_emb_name),
            transpose_y=True)
        if self.cls_bias:
            fc_out += layers.create_parameter(
                shape=[self.vocab_size],
                dtype=self.dtype,
                attr=fluid.ParamAttr(name="mask_lm_out_fc.b_0"),
                is_bias=True)
    else:
        seq_out_bias_attr = fluid.ParamAttr(
            name="mask_lm_out_fc.b_0") if self.cls_bias else False
        fc_out = layers.fc(
            input=seq_trans_feat,
            size=self.vocab_size,
            param_attr=fluid.ParamAttr(
                name="mask_lm_out_fc.w_0",
                initializer=self.param_initializer),
            bias_attr=seq_out_bias_attr)
    return fc_out
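# Illustrative numpy sketch (assumed toy shapes) of the weight-sharing branch
# in _calc_logits above: with weight sharing on, the output projection reuses
# the token embedding matrix E via a transposed matmul, plus a learned bias.
import numpy as np

emb_size, vocab_size, n_tokens = 4, 10, 3
E = np.random.rand(vocab_size, emb_size).astype("float32")  # token embeddings
h = np.random.rand(n_tokens, emb_size).astype("float32")    # seq_trans_feat
b = np.zeros(vocab_size, dtype="float32")                   # mask_lm_out_fc.b_0

logits = h.dot(E.T) + b  # == layers.matmul(h, E, transpose_y=True) + bias
assert logits.shape == (n_tokens, vocab_size)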
def __init__(self, cfg, name=None):
    super(ErnieModelForPretraining, self).__init__(cfg, name=name)
    initializer = F.initializer.TruncatedNormal(
        scale=cfg['initializer_range'])
    d_model = cfg['hidden_size']
    d_vocab = cfg['vocab_size']

    self.pooler_heads = D.LayerList([NSPHead(cfg, name=name)])
    self.mlm = _build_linear(
        d_model,
        d_model,
        append_name(name, 'mask_lm_trans_fc'),
        initializer,
        act=cfg['hidden_act'])
    self.mlm_ln = _build_ln(d_model, name=append_name(name, 'mask_lm_trans'))
    self.mlm_bias = L.create_parameter(
        dtype='float32',
        shape=[d_vocab],
        attr=F.ParamAttr(
            name=append_name(name, 'mask_lm_out_fc.b_0'),
            initializer=F.initializer.Constant(value=0.0)),
        is_bias=True,
    )
def gin_layer(gw, node_features, edge_features, train_eps, name):
    def send_func(src_feat, dst_feat, edge_feat):
        """Send"""
        return src_feat["h"] + edge_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=F.ParamAttr(name="%s_eps" % name),
        default_initializer=F.initializer.ConstantInitializer(value=0.0))
    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(
        send_func,
        nfeat_list=[("h", node_features)],
        efeat_list=[("h", edge_features)])
    node_feat = gw.recv(msg, "sum") + node_features * (epsilon + 1.0)
    return node_feat
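# Hedged numpy sketch of the GIN aggregation in gin_layer above, on a toy
# graph (all data below is made up for illustration): each node receives the
# sum of (source feature + edge feature) over its incoming edges, plus
# (1 + epsilon) times its own feature.
import numpy as np

h = np.array([[1.0], [2.0], [3.0]], dtype="float32")  # node features
edges = [(0, 1), (1, 2), (2, 0)]                      # (src, dst) pairs
e = np.ones((len(edges), 1), dtype="float32")         # edge features
eps = 0.0

out = (1.0 + eps) * h
for k, (src, dst) in enumerate(edges):
    out[dst] += h[src] + e[k]  # send_func followed by "sum" recv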
def _create_mask_variables(cls, main_program, startup_program, params):
    r"""
    Create sparse mask Tensors according to supported layers in
    :attr:`main_program`. This function is called in the second step of
    `ASPHelper._minimize`.

    Args:
        main_program (Program): Program with model definition and its parameters.
        startup_program (Program): Program for initializing parameters.
        params (list): Variable parameters.
    """
    asp_info = cls._get_program_asp_info(main_program)
    with program_guard(main_program, startup_program):
        for param in params:
            if ASPHelper._is_supported_layer(main_program, param.name):
                if param.name not in asp_info.mask_vars:
                    mask_param = layers.create_parameter(
                        name=ASPHelper._get_mask_name(param.name),
                        shape=param.shape,
                        dtype=param.dtype,
                        default_initializer=ConstantInitializer(value=1.0))
                    mask_param.stop_gradient = True
                    mask_param.trainable = False
                    asp_info.update_mask_vars(param.name, mask_param)
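# Illustrative sketch of how such a mask is consumed (an assumption about the
# surrounding ASP workflow, not code from this module): pruning writes zeros
# into the mask, and multiplying weight * mask elementwise keeps the sparse
# pattern fixed across updates.
import numpy as np

weight = np.random.rand(4, 4).astype("float32")
mask = np.ones_like(weight)        # created all-ones, like the code above
mask[:, ::2] = 0.0                 # toy pruning pattern (not a real 2:4 mask)
masked_weight = weight * mask      # pruned entries stay exactly zero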
def __init__(self, cfg, name=None):
    cfg['return_additional_info'] = True
    cfg['has_pooler'] = False
    super(ErnieModelForGeneration, self).__init__(cfg, name=name)
    initializer = F.initializer.TruncatedNormal(
        scale=cfg['initializer_range'])
    d_model = cfg['hidden_size']
    d_vocab = cfg['vocab_size']

    self.mlm = _build_linear(
        d_model,
        d_model,
        append_name(name, 'mask_lm_trans_fc'),
        initializer,
        act=cfg['hidden_act'])
    self.mlm_ln = _build_ln(d_model, name=append_name(name, 'mask_lm_trans'))
    self.mlm_bias = L.create_parameter(
        dtype='float32',
        shape=[d_vocab],
        attr=F.ParamAttr(
            name=append_name(name, 'mask_lm_out_fc.b_0'),
            initializer=F.initializer.Constant(value=0.0)),
        is_bias=True,
    )
def gcn_layer(gw, feature, edge_features, act, name):
    """GCN-style layer: aggregate (node + edge) messages by mean, then apply
    a linear transform with a separate bias and activation."""

    def send_func(src_feat, dst_feat, edge_feat):
        """Send the sum of source-node and edge features along each edge."""
        return src_feat["h"] + edge_feat["h"]

    size = feature.shape[-1]
    msg = gw.send(
        send_func,
        nfeat_list=[("h", feature)],
        efeat_list=[("h", edge_features)])
    output = gw.recv(msg, mean_recv)
    output = layers.fc(
        output,
        size=size,
        bias_attr=False,
        param_attr=fluid.ParamAttr(name=name))
    bias = layers.create_parameter(
        shape=[size], dtype='float32', is_bias=True, name=name + '_bias')
    output = layers.elementwise_add(output, bias, act=act)
    return output
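# Hedged numpy sketch of the message path in gcn_layer above, on a synthetic
# two-edge graph: messages are src + edge features, and the receive step
# averages the messages arriving at each destination node.
import numpy as np

h = np.random.rand(3, 4).astype("float32")           # node features
edges = [(0, 2), (1, 2)]                             # (src, dst) pairs
e = np.random.rand(len(edges), 4).astype("float32")  # edge features

msgs = [h[s] + e[k] for k, (s, d) in enumerate(edges) if d == 2]
recv_node2 = np.mean(msgs, axis=0)                   # mean_recv for node 2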
def network(batch_size, items_num, hidden_size, step, rate):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = layers.data(
        name="items",
        shape=[batch_size, -1, 1],
        dtype="int64",
        append_batch_size=False)  # [batch_size, uniq_max, 1]
    seq_index = layers.data(
        name="seq_index",
        shape=[batch_size, -1],
        dtype="int64",
        append_batch_size=False)  # [seq_max * batch_size, 1]
    last_index = layers.data(
        name="last_index",
        shape=[batch_size],
        dtype="int64",
        append_batch_size=False)  # [batch_size, 1]
    adj_in = layers.data(
        name="adj_in",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    adj_out = layers.data(
        name="adj_out",
        shape=[batch_size, -1, -1],
        dtype="float32",
        append_batch_size=False)
    mask = layers.data(
        name="mask",
        shape=[batch_size, -1, 1],
        dtype="float32",
        append_batch_size=False)
    label = layers.data(
        name="label",
        shape=[batch_size, 1],
        dtype="int64",
        append_batch_size=False)

    items_emb = layers.embedding(
        input=items,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [batch_size, uniq_max, h]
    data_feed = [items, seq_index, last_index, adj_in, adj_out, mask, label]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(
            x=pre_state, shape=[batch_size, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in, state_in)     # [batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out, state_out)  # [batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(
            input=gru_input, name="gru_fc", size=3 * hidden_size,
            bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = pre_state
    seq_index = layers.reshape(seq_index, shape=[-1])
    seq = layers.gather(final_state, seq_index)    # [batch_size * seq_max, h]
    last = layers.gather(final_state, last_index)  # [batch_size, h]
    seq = layers.reshape(
        seq, shape=[batch_size, -1, hidden_size])  # [batch_size, seq_max, h]
    last = layers.reshape(
        last, shape=[batch_size, hidden_size])     # [batch_size, h]

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  # [seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
    add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  # [batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(seq, weight, axis=0)
    global_attention = layers.reduce_sum(weight_mask, dim=1)

    final_attention = layers.concat(
        [global_attention, last], axis=1)  # [batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    all_vocab = layers.create_global_var(
        shape=[items_num - 1, 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")
    all_emb = layers.embedding(
        input=all_vocab,
        is_sparse=True,
        param_attr=fluid.ParamAttr(
            name="emb",
            learning_rate=rate,
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  # [batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  # [batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=20)
    return loss, acc, data_feed, [items_emb, all_emb]
def __init__(
        self,
        dlatent_size,            # Disentangled latent (W) dimensionality.
        resolution=1024,         # Output resolution (1024 x 1024 by default).
        fmap_base=8192,          # Overall multiplier for the number of feature maps.
        num_channels=3,          # Number of output color channels.
        structure='fixed',       # 'fixed' = no progressive growing, 'linear' = human-readable, 'recursive' = efficient, 'auto' = select automatically.
        fmap_max=512,            # Maximum number of feature maps in any layer.
        fmap_decay=1.0,          # log2 feature map reduction when doubling the resolution.
        f=None,                  # Low-pass filter to apply when resampling activations; None = no filtering. (Heavy; if you don't have enough resources, pass `f=None`.)
        use_pixel_norm=False,    # Enable pixelwise feature vector normalization?
        use_instance_norm=True,  # Enable instance normalization?
        use_wscale=True,         # Enable equalized learning rate?
        use_noise=True,          # Enable noise inputs?
        use_style=True           # Enable style inputs?
):
    """Synthesis network of the generator (the second part of the generator).

    Parameters:
        dlatent_size: disentangled latent (W) dimensionality, e.g. 512.
        resolution: output resolution, 1024 x 1024 by default.
        fmap_base: overall multiplier for the number of feature maps.
        num_channels: number of output color channels.
        structure: only the 'fixed' mode is supported here.
        fmap_max: maximum number of feature maps in any layer.
    """
    super(G_synthesis, self).__init__()
    self.nf = lambda stage: min(
        int(fmap_base / (2.0**(stage * fmap_decay))), fmap_max)
    self.structure = structure
    # "- 2" because we start from a feature map of height and width 4.
    # For resolution 1024 this gives num_layers = 18.
    self.resolution_log2 = int(np.log2(resolution))
    num_layers = self.resolution_log2 * 2 - 2
    self.num_layers = num_layers

    # Noise inputs, one per layer.
    self.noise_inputs = []
    for layer_idx in range(num_layers):
        res = layer_idx // 2 + 2
        shape = [1, 1, 2**res, 2**res]
        self.noise_inputs.append(layers.randn(shape))

    # Blur2d
    self.blur = Blur2d(f)

    # torgb: fixed mode.
    # channel 16 -> channel 8
    self.channel_shrinkage = Conv2d(
        self.nf(self.resolution_log2 - 2),
        self.nf(self.resolution_log2),
        3,
        use_wscale=use_wscale)
    # channel 8 -> channel 3
    self.torgb = Conv2d(
        self.nf(self.resolution_log2),
        num_channels,
        1,
        gain=1,
        use_wscale=use_wscale)

    # Initial input block.
    self.const_input = layers.create_parameter(
        (1, self.nf(1), 4, 4),
        'float32',
        default_initializer=fluid.initializer.ConstantInitializer(value=1.0))
    self.bias = layers.create_parameter(
        (self.nf(1), ),
        'float32',
        default_initializer=fluid.initializer.ConstantInitializer(value=1.0))
    self.adaIn1 = LayerEpilogue(self.nf(1), dlatent_size, use_wscale,
                                use_noise, use_pixel_norm, use_instance_norm,
                                use_style)
    self.conv1 = Conv2d(
        self.nf(1), self.nf(1), 3, gain=1, use_wscale=use_wscale)
    self.adaIn2 = LayerEpilogue(self.nf(1), dlatent_size, use_wscale,
                                use_noise, use_pixel_norm, use_instance_norm,
                                use_style)

    # Common blocks, doubling resolution each time.
    # 4 x 4 -> 8 x 8
    res = 3
    self.GBlock1 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 8 x 8 -> 16 x 16
    res = 4
    self.GBlock2 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 16 x 16 -> 32 x 32
    res = 5
    self.GBlock3 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 32 x 32 -> 64 x 64
    res = 6
    self.GBlock4 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 64 x 64 -> 128 x 128
    res = 7
    self.GBlock5 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 128 x 128 -> 256 x 256
    res = 8
    self.GBlock6 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 256 x 256 -> 512 x 512
    res = 9
    self.GBlock7 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
    # 512 x 512 -> 1024 x 1024
    res = 10
    self.GBlock8 = GBlock(res, use_wscale, use_noise, use_pixel_norm,
                          use_instance_norm, self.noise_inputs)
def point_network_decoder(p_vec, q_vec, hidden_size, args):
    """Output layer - pointer network"""
    tag = 'pn_decoder_'
    init_random = fluid.initializer.Normal(loc=0.0, scale=1.0)

    random_attn = layers.create_parameter(
        shape=[1, hidden_size],
        dtype='float32',
        default_initializer=init_random)
    random_attn = layers.fc(
        input=random_attn,
        size=hidden_size,
        act=None,
        param_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_b'))
    random_attn = layers.reshape(random_attn, shape=[-1])

    U = layers.fc(
        input=q_vec,
        param_attr=fluid.ParamAttr(name=tag + 'q_vec_fc_w'),
        bias_attr=False,
        size=hidden_size,
        act=None) + random_attn
    U = layers.tanh(U)

    logits = layers.fc(
        input=U,
        param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'),
        size=1,
        act=None)
    scores = layers.sequence_softmax(input=logits)
    pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0)
    pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum')

    init_state = layers.fc(
        input=pooled_vec,
        param_attr=fluid.ParamAttr(name=tag + 'init_state_fc_w'),
        bias_attr=fluid.ParamAttr(name=tag + 'init_state_fc_b'),
        size=hidden_size,
        act=None)

    def custom_dynamic_rnn(p_vec, init_state, hidden_size, para_name, args):
        tag = para_name + "custom_dynamic_rnn_"

        def static_rnn(step,
                       p_vec=p_vec,
                       init_state=None,
                       para_name='',
                       args=args):
            tag = para_name + "static_rnn_"
            ctx = layers.fc(
                input=p_vec,
                param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'),
                bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'),
                size=hidden_size,
                act=None)
            beta = []
            c_prev = init_state
            m_prev = init_state
            for i in range(step):
                m_prev0 = layers.fc(
                    input=m_prev,
                    size=hidden_size,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b'))
                m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx)

                Fk = ctx + m_prev1
                Fk = layers.tanh(Fk)
                logits = layers.fc(
                    input=Fk,
                    size=1,
                    act=None,
                    param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'),
                    bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'))

                scores = layers.sequence_softmax(input=logits)
                attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0)
                attn_ctx = layers.sequence_pool(
                    input=attn_ctx, pool_type='sum')

                hidden_t, cell_t = lstm_step(
                    attn_ctx,
                    hidden_t_prev=m_prev,
                    cell_t_prev=c_prev,
                    size=hidden_size,
                    para_name=tag,
                    args=args)
                m_prev = hidden_t
                c_prev = cell_t
                beta.append(scores)
            return beta

        return static_rnn(
            2, p_vec=p_vec, init_state=init_state, para_name=para_name)

    fw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size,
                                    tag + "fw_", args)
    bw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size,
                                    tag + "bw_", args)

    start_prob = layers.elementwise_add(
        x=fw_outputs[0], y=bw_outputs[1], axis=0) / 2
    end_prob = layers.elementwise_add(
        x=fw_outputs[1], y=bw_outputs[0], axis=0) / 2
    return start_prob, end_prob
def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None):
    weight_1_arr = []
    weight_2_arr = []
    bias_arr = []
    hidden_array = []
    cell_array = []
    mask_array = []

    for i in range(num_layers):
        weight_1 = layers.create_parameter(
            [hidden_size * 2, hidden_size * 4],
            dtype="float32",
            name="fc_weight1_" + str(i),
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        weight_1_arr.append(weight_1)
        bias_1 = layers.create_parameter(
            [hidden_size * 4],
            dtype="float32",
            name="fc_bias1_" + str(i),
            default_initializer=fluid.initializer.Constant(0.0))
        bias_arr.append(bias_1)

        pre_hidden = layers.slice(
            init_hidden, axes=[0], starts=[i], ends=[i + 1])
        pre_cell = layers.slice(
            init_cell, axes=[0], starts=[i], ends=[i + 1])
        pre_hidden = layers.reshape(
            pre_hidden, shape=[-1, hidden_size], inplace=True)
        pre_cell = layers.reshape(
            pre_cell, shape=[-1, hidden_size], inplace=True)
        hidden_array.append(pre_hidden)
        cell_array.append(pre_cell)

    res = []
    sliced_inputs = layers.split(input_embedding, num_or_sections=len, dim=1)
    for index in range(len):
        input = sliced_inputs[index]
        input = layers.reshape(input, shape=[-1, hidden_size], inplace=True)
        for k in range(num_layers):
            pre_hidden = hidden_array[k]
            pre_cell = cell_array[k]
            weight_1 = weight_1_arr[k]
            bias = bias_arr[k]

            nn = layers.concat([input, pre_hidden], 1)
            gate_input = layers.matmul(x=nn, y=weight_1)
            gate_input = layers.elementwise_add(gate_input, bias)
            i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)

            c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                i) * layers.tanh(j)
            m = layers.tanh(c) * layers.sigmoid(o)

            hidden_array[k] = m
            cell_array[k] = c
            input = m

            if dropout is not None and dropout > 0.0:
                input = layers.dropout(
                    input,
                    dropout_prob=dropout,
                    dropout_implementation='upscale_in_train')
        res.append(input)

    last_hidden = layers.concat(hidden_array, 1)
    last_hidden = layers.reshape(
        last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
    last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

    last_cell = layers.concat(cell_array, 1)
    last_cell = layers.reshape(last_cell, shape=[-1, num_layers, hidden_size])
    last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

    real_res = layers.concat(res, 0)
    real_res = layers.reshape(
        real_res, shape=[len, -1, hidden_size], inplace=True)
    real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
    return real_res, last_hidden, last_cell
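# Illustrative numpy sketch of the fused gate computation inside the loop
# above: a single matmul produces all four LSTM gates at once, which are then
# split into i, j, f, o. Shapes and data here are toy values.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

hidden_size = 2
x_and_h = np.random.rand(1, hidden_size * 2).astype("float32")  # concat([input, pre_hidden], 1)
W = np.random.rand(hidden_size * 2, hidden_size * 4).astype("float32")
b = np.zeros(hidden_size * 4, dtype="float32")
pre_cell = np.zeros((1, hidden_size), dtype="float32")

gates = x_and_h.dot(W) + b
i, j, f, o = np.split(gates, 4, axis=-1)
c = pre_cell * sigmoid(f) + sigmoid(i) * np.tanh(j)
m = np.tanh(c) * sigmoid(o)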
def gat_layer(gw, feature, edge_features, hidden_size, act, name,
              num_heads=1, feat_drop=0.1, attn_drop=0.1, is_test=False):
    """GAT layer that also mixes edge features into the messages."""

    def send_attention(src_feat, dst_feat, edge_feat):
        """Compute unnormalized attention logits and the edge messages."""
        output = src_feat["left_a"] + dst_feat["right_a"]
        output = layers.leaky_relu(output, alpha=0.2)  # (num_edges, num_heads)
        return {"alpha": output, "h": src_feat["h"] + edge_feat["h"]}

    def reduce_attention(msg):
        """Normalize attention per destination node and pool the messages."""
        alpha = msg["alpha"]  # lod-tensor (batch_size, seq_len, num_heads)
        h = msg["h"]
        alpha = paddle_helper.sequence_softmax(alpha)
        old_h = h
        h = layers.reshape(h, [-1, num_heads, hidden_size])
        alpha = layers.reshape(alpha, [-1, num_heads, 1])
        if attn_drop > 1e-15:
            alpha = layers.dropout(
                alpha,
                dropout_prob=attn_drop,
                is_test=is_test,
                dropout_implementation="upscale_in_train")
        h = h * alpha
        h = layers.reshape(h, [-1, num_heads * hidden_size])
        h = layers.lod_reset(h, old_h)
        return layers.sequence_pool(h, "sum")

    if feat_drop > 1e-15:
        feature = layers.dropout(
            feature,
            dropout_prob=feat_drop,
            is_test=is_test,
            dropout_implementation='upscale_in_train')

    ft = layers.fc(
        feature,
        hidden_size * num_heads,
        bias_attr=False,
        param_attr=fluid.ParamAttr(name=name + '_weight'))
    left_a = layers.create_parameter(
        shape=[num_heads, hidden_size],
        dtype='float32',
        name=name + '_gat_l_A')
    right_a = layers.create_parameter(
        shape=[num_heads, hidden_size],
        dtype='float32',
        name=name + '_gat_r_A')
    reshape_ft = layers.reshape(ft, [-1, num_heads, hidden_size])
    left_a_value = layers.reduce_sum(reshape_ft * left_a, -1)
    right_a_value = layers.reduce_sum(reshape_ft * right_a, -1)

    msg = gw.send(
        send_attention,
        nfeat_list=[("h", ft), ("left_a", left_a_value),
                    ("right_a", right_a_value)],
        efeat_list=[("h", edge_features)])
    output = gw.recv(msg, reduce_attention)

    bias = layers.create_parameter(
        shape=[hidden_size * num_heads],
        dtype='float32',
        is_bias=True,
        name=name + '_bias')
    bias.stop_gradient = True
    output = layers.elementwise_add(output, bias, act=act)
    return output
def decode(self,
           dec_input,
           enc_words_output,
           enc_sents_output,
           caches=None,
           gather_idx=None):
    """Decoding to generate output text"""
    trg_word, trg_pos, trg_slf_attn_bias, trg_src_words_attn_bias, \
        trg_src_sents_attn_bias, graph_attn_bias = dec_input

    dec_res = self._gen_dec_input(trg_word, trg_pos, trg_slf_attn_bias,
                                  trg_src_words_attn_bias,
                                  trg_src_sents_attn_bias, graph_attn_bias)
    emb_out, trg_slf_attn_bias, trg_src_words_attn_bias, \
        trg_src_sents_attn_bias, graph_attn_bias = \
        dec_res.emb_out, dec_res.trg_slf_attn_bias, \
        dec_res.trg_src_words_attn_bias, \
        dec_res.trg_src_sents_attn_bias, dec_res.graph_attn_bias

    # (batch_size, tgt_len, emb_dim)
    dec_output = graph_decoder(
        dec_input=emb_out,  # (batch_size, tgt_len, emb_dim)
        enc_words_output=enc_words_output,  # (batch_size, n_blocks, n_tokens, emb_dim)
        enc_sents_output=enc_sents_output,  # (batch_size, n_blocks, emb_dim)
        dec_slf_attn_bias=trg_slf_attn_bias,  # (batch_size, n_head, tgt_len, tgt_len)
        dec_enc_words_attn_bias=trg_src_words_attn_bias,  # (batch_size, n_blocks, n_head, tgt_len, n_tokens)
        dec_enc_sents_attn_bias=trg_src_sents_attn_bias,  # (batch_size, n_head, tgt_len, n_blocks)
        graph_attn_bias=graph_attn_bias,  # (batch_size, n_head, n_blocks, n_blocks)
        pos_win=self.pos_win,
        n_layer=self._dec_n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=self._prepostprocess_dropout,
        hidden_act=self._hidden_act,
        preprocess_cmd=self._preprocess_command,
        postprocess_cmd=self._postprocess_command,
        param_initializer=self._param_initializer,
        caches=caches,
        gather_idx=gather_idx,
        name='graph_decoder')

    # Reshape to a 2D tensor to use GEMM instead of BatchedGEMM.
    # (batch_size * tgt_len, emb_dim)
    dec_output = layers.reshape(
        dec_output, shape=[-1, self._emb_size], inplace=True)

    if self._dtype == "float16":
        dec_output = fluid.layers.cast(x=dec_output, dtype=self._emb_dtype)

    if self._weight_sharing:
        out = layers.matmul(
            x=dec_output,
            y=fluid.default_main_program().global_block().var(
                self._word_emb_name),
            transpose_y=True)
        bias = layers.create_parameter(
            shape=[self.voc_size],
            dtype=self._emb_dtype,
            attr=fluid.ParamAttr(
                name='generator.bias',
                initializer=fluid.initializer.Constant(value=0.0)),
            is_bias=True)
        predict = layers.elementwise_add(x=out, y=bias, axis=-1)
    else:
        predict = layers.fc(
            input=dec_output,
            size=self.voc_size,
            param_attr=fluid.ParamAttr(
                name="generator.w",
                initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=fluid.ParamAttr(
                name='generator.bias',
                initializer=fluid.initializer.Constant(value=0.0)))
    return predict
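# Hedged numpy sketch of the "reshape to 2D for GEMM" comment above: folding
# (batch, tgt_len, emb) into (batch * tgt_len, emb) lets the output projection
# run as a single matmul instead of a batched one, with identical results.
# Dimensions below are toy values.
import numpy as np

B, T, D, V = 2, 3, 4, 5
dec = np.random.rand(B, T, D).astype("float32")
W = np.random.rand(V, D).astype("float32")  # tied word-embedding matrix

flat = dec.reshape(-1, D)                   # (B*T, D)
logits = flat.dot(W.T)                      # one GEMM, (B*T, V)
assert np.allclose(logits.reshape(B, T, V), dec.dot(W.T), atol=1e-5)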
def lm_model(hidden_size,
             vocab_size,
             batch_size,
             num_layers=2,
             num_steps=20,
             init_scale=0.1,
             dropout=None,
             rnn_model='static',
             use_py_reader=False):
    def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []

        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
            pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
        rnn = PaddingRNN()
        with rnn.step():
            input = rnn.step_input(input_embedding)
            for k in range(num_layers):
                pre_hidden = rnn.memory(init=hidden_array[k])
                pre_cell = rnn.memory(init=cell_array[k])
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)
                gate_input = layers.elementwise_add(gate_input, bias)
                i = layers.slice(
                    gate_input, axes=[1], starts=[0], ends=[hidden_size])
                j = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size],
                    ends=[hidden_size * 2])
                f = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 2],
                    ends=[hidden_size * 3])
                o = layers.slice(
                    gate_input,
                    axes=[1],
                    starts=[hidden_size * 3],
                    ends=[hidden_size * 4])

                c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                    i) * layers.tanh(j)
                m = layers.tanh(c) * layers.sigmoid(o)

                rnn.update_memory(pre_hidden, m)
                rnn.update_memory(pre_cell, c)
                rnn.step_output(m)
                rnn.step_output(c)

                input = m
                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')
            rnn.step_output(input)

        rnnout = rnn()

        last_hidden_array = []
        last_cell_array = []
        real_res = rnnout[-1]
        for i in range(num_layers):
            m = rnnout[i * 2]
            c = rnnout[i * 2 + 1]
            m.stop_gradient = True
            c.stop_gradient = True
            last_h = layers.slice(
                m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_hidden_array.append(last_h)
            last_c = layers.slice(
                c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
            last_cell_array.append(last_c)

        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        last_hidden = layers.concat(last_hidden_array, 0)
        last_cell = layers.concat(last_cell_array, 0)
        return real_res, last_hidden, last_cell

    def encoder_static(input_embedding, len=3, init_hidden=None,
                       init_cell=None):
        weight_1_arr = []
        weight_2_arr = []
        bias_arr = []
        hidden_array = []
        cell_array = []
        mask_array = []

        for i in range(num_layers):
            weight_1 = layers.create_parameter(
                [hidden_size * 2, hidden_size * 4],
                dtype="float32",
                name="fc_weight1_" + str(i),
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale))
            weight_1_arr.append(weight_1)
            bias_1 = layers.create_parameter(
                [hidden_size * 4],
                dtype="float32",
                name="fc_bias1_" + str(i),
                default_initializer=fluid.initializer.Constant(0.0))
            bias_arr.append(bias_1)

            pre_hidden = layers.slice(
                init_hidden, axes=[0], starts=[i], ends=[i + 1])
            pre_cell = layers.slice(
                init_cell, axes=[0], starts=[i], ends=[i + 1])
            pre_hidden = layers.reshape(
                pre_hidden, shape=[-1, hidden_size], inplace=True)
            pre_cell = layers.reshape(
                pre_cell, shape=[-1, hidden_size], inplace=True)
            hidden_array.append(pre_hidden)
            cell_array.append(pre_cell)

        res = []
        sliced_inputs = layers.split(
            input_embedding, num_or_sections=len, dim=1)
        for index in range(len):
            input = sliced_inputs[index]
            input = layers.reshape(
                input, shape=[-1, hidden_size], inplace=True)
            for k in range(num_layers):
                pre_hidden = hidden_array[k]
                pre_cell = cell_array[k]
                weight_1 = weight_1_arr[k]
                bias = bias_arr[k]

                nn = layers.concat([input, pre_hidden], 1)
                gate_input = layers.matmul(x=nn, y=weight_1)
                gate_input = layers.elementwise_add(gate_input, bias)
                i, j, f, o = layers.split(
                    gate_input, num_or_sections=4, dim=-1)

                try:
                    from paddle.fluid.contrib.layers import \
                        fused_elemwise_activation
                    # layers.sigmoid(i) * layers.tanh(j)
                    tmp0 = fused_elemwise_activation(
                        x=layers.tanh(j),
                        y=i,
                        functor_list=['elementwise_mul', 'sigmoid'])
                    # pre_cell * layers.sigmoid(f)
                    tmp1 = fused_elemwise_activation(
                        x=pre_cell,
                        y=f,
                        functor_list=['elementwise_mul', 'sigmoid'])
                    c = tmp0 + tmp1
                    # layers.tanh(c) * layers.sigmoid(o)
                    m = fused_elemwise_activation(
                        x=layers.tanh(c),
                        y=o,
                        functor_list=['elementwise_mul', 'sigmoid'])
                except ImportError:
                    c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                        i) * layers.tanh(j)
                    m = layers.tanh(c) * layers.sigmoid(o)

                hidden_array[k] = m
                cell_array[k] = c
                input = m

                if dropout is not None and dropout > 0.0:
                    input = layers.dropout(
                        input,
                        dropout_prob=dropout,
                        dropout_implementation='upscale_in_train')
            res.append(input)

        last_hidden = layers.concat(hidden_array, 1)
        last_hidden = layers.reshape(
            last_hidden, shape=[-1, num_layers, hidden_size], inplace=True)
        last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = layers.concat(cell_array, 1)
        last_cell = layers.reshape(
            last_cell, shape=[-1, num_layers, hidden_size])
        last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2])

        real_res = layers.concat(res, 0)
        real_res = layers.reshape(
            real_res, shape=[len, -1, hidden_size], inplace=True)
        real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
        return real_res, last_hidden, last_cell

    batch_size_each = batch_size // fluid.core.get_cuda_device_count()
    if use_py_reader:
        feed_shapes = [[batch_size_each, num_steps, 1],
                       [batch_size_each * num_steps, 1]]
        py_reader = fluid.layers.py_reader(
            capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64'])
        x, y = fluid.layers.read_file(py_reader)
    else:
        x = layers.data(
            name="x",
            shape=[batch_size_each, num_steps, 1],
            dtype='int64',
            append_batch_size=False)
        y = layers.data(
            name="y",
            shape=[batch_size_each * num_steps, 1],
            dtype='int64',
            append_batch_size=False)

    init_hidden = layers.data(
        name="init_hidden",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False)
    init_cell = layers.data(
        name="init_cell",
        shape=[num_layers, batch_size_each, hidden_size],
        dtype='float32',
        append_batch_size=False)
    init_cell.persistable = True
    init_hidden.persistable = True
    init_hidden = layers.reshape(
        init_hidden, shape=[num_layers, -1, hidden_size])
    init_cell = layers.reshape(init_cell, shape=[num_layers, -1, hidden_size])

    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale)))
    x_emb = layers.reshape(
        x_emb, shape=[-1, num_steps, hidden_size], inplace=True)
    if dropout is not None and dropout > 0.0:
        x_emb = layers.dropout(
            x_emb,
            dropout_prob=dropout,
            dropout_implementation='upscale_in_train')

    if rnn_model == "padding":
        rnn_out, last_hidden, last_cell = padding_rnn(
            x_emb, len=num_steps, init_hidden=init_hidden,
            init_cell=init_cell)
    elif rnn_model == "static":
        rnn_out, last_hidden, last_cell = encoder_static(
            x_emb, len=num_steps, init_hidden=init_hidden,
            init_cell=init_cell)
    elif rnn_model == "cudnn":
        x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
        rnn_out, last_hidden, last_cell = layers.lstm(
            x_emb,
            init_hidden,
            init_cell,
            num_steps,
            hidden_size,
            num_layers,
            is_bidirec=False,
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2])
    else:
        raise ValueError("rnn_model type not supported: %s" % rnn_model)

    rnn_out = layers.reshape(
        rnn_out, shape=[-1, num_steps, hidden_size], inplace=True)

    softmax_weight = layers.create_parameter(
        [hidden_size, vocab_size],
        dtype="float32",
        name="softmax_weight",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))
    softmax_bias = layers.create_parameter(
        [vocab_size],
        dtype="float32",
        name='softmax_bias',
        default_initializer=fluid.initializer.UniformInitializer(
            low=-init_scale, high=init_scale))

    projection = layers.matmul(rnn_out, softmax_weight)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(
        projection, shape=[-1, vocab_size], inplace=True)

    loss = layers.softmax_with_cross_entropy(
        logits=projection, label=y, soft_label=False)
    loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True)
    loss = layers.reduce_mean(loss, dim=[0])
    loss = layers.reduce_sum(loss)

    loss.persistable = True
    last_cell.persistable = True
    last_hidden.persistable = True

    layers.assign(input=last_cell, output=init_cell)
    layers.assign(input=last_hidden, output=init_hidden)

    feeding_list = ['x', 'y', 'init_hidden', 'init_cell']
    if use_py_reader:
        return loss, last_hidden, last_cell, feeding_list, py_reader
    else:
        return loss, last_hidden, last_cell, feeding_list
def __init__(self,
             batch_size,
             channels=1,
             bottleneck=32,
             params=[0, 1, 1, 1, 1],
             n_iter=20,
             last=False,
             bn=True):
    super(FlowLayer, self).__init__()
    self.batch_size = batch_size
    self.bottleneck = Conv2D(
        channels,
        bottleneck,
        stride=1,
        padding=0,
        filter_size=1,
        bias_attr=fluid.ParamAttr(trainable=False))
    self.unbottleneck = Conv2D(
        bottleneck * 2,
        channels,
        stride=1,
        padding=(1, 1),
        filter_size=(3, 3),
        bias_attr=fluid.ParamAttr(trainable=False))
    self.bn = BatchNorm(channels) if bn else None
    channels = bottleneck

    # Image-gradient convolutions (central difference along x and y).
    self.conv4Ix = Conv2D(
        channels,
        channels,
        padding=0,
        stride=1,
        filter_size=3,
        param_attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([[[[-0.5, 0, 0.5]]] * channels] * channels)),
            trainable=params[0] == 1),
        bias_attr=fluid.ParamAttr(trainable=False),
        groups=1)
    self.conv4Iy = Conv2D(
        channels,
        channels,
        padding=0,
        stride=1,
        filter_size=3,
        param_attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([[[[-0.5], [0], [0.5]]] * channels] * channels)),
            trainable=params[0] == 1),
        bias_attr=fluid.ParamAttr(trainable=False),
        groups=1)

    # Forward-difference convolutions for the dual variable p and flow u, v.
    self.conv4px = Conv2D(
        channels,
        channels,
        padding=0,
        stride=1,
        filter_size=(1, 2),
        param_attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([[[[-1, 1]]] * channels] * channels)),
            trainable=params[1] == 1),
        bias_attr=fluid.ParamAttr(trainable=False),
        groups=1)
    self.conv4py = Conv2D(
        channels,
        channels,
        padding=0,
        stride=1,
        filter_size=(2, 1),
        param_attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([[[[-1], [1]]] * channels] * channels)),
            trainable=params[1] == 1),
        bias_attr=fluid.ParamAttr(trainable=False),
        groups=1)
    self.conv4u = Conv2D(
        channels,
        channels,
        padding=0,
        stride=1,
        filter_size=(1, 2),
        param_attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([[[[-1, 1]]] * channels] * channels)),
            trainable=params[1] == 1),
        bias_attr=fluid.ParamAttr(trainable=False),
        groups=1)
    self.conv4v = Conv2D(
        channels,
        channels,
        padding=0,
        stride=1,
        filter_size=(2, 1),
        param_attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([[[[-1], [1]]] * channels] * channels)),
            trainable=params[1] == 1),
        bias_attr=fluid.ParamAttr(trainable=False),
        groups=1)

    self.n_iter = n_iter
    self.channels = channels

    # Learnable scalar hyperparameters of the iterative flow solver.
    self.theta = layers.create_parameter(
        shape=[1],
        dtype='float32',
        attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([0.3])),
            trainable=params[2] == 1))
    self.lamda = layers.create_parameter(
        shape=[1],
        dtype='float32',
        attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([0.15])),
            trainable=params[3] == 1))
    self.tau = layers.create_parameter(
        shape=[1],
        dtype='float32',
        attr=fluid.ParamAttr(
            learning_rate=0.01,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.array([0.25])),
            trainable=params[4] == 1))
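# Hedged numpy check (not from the original repo) that the fixed kernels above
# are finite-difference operators: [-0.5, 0, 0.5] is a central difference, so
# cross-correlating it with a row yields 0.5 * (x[i+2] - x[i]).
import numpy as np

row = np.array([1.0, 2.0, 4.0, 7.0], dtype="float32")
kernel = np.array([-0.5, 0.0, 0.5], dtype="float32")

grad_x = np.correlate(row, kernel, mode="valid")  # conv layers cross-correlate
assert np.allclose(grad_x, [1.5, 2.5])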
def network(items_num, hidden_size, step, bs):
    stdv = 1.0 / math.sqrt(hidden_size)

    items = fluid.data(
        name="items", shape=[bs, -1], dtype="int64")  # [batch_size, uniq_max]
    seq_index = fluid.data(
        name="seq_index", shape=[bs, -1, 2],
        dtype="int32")  # [batch_size, seq_max, 2]
    last_index = fluid.data(
        name="last_index", shape=[bs, 2], dtype="int32")  # [batch_size, 2]
    adj_in = fluid.data(
        name="adj_in", shape=[bs, -1, -1],
        dtype="float32")  # [batch_size, seq_max, seq_max]
    adj_out = fluid.data(
        name="adj_out", shape=[bs, -1, -1],
        dtype="float32")  # [batch_size, seq_max, seq_max]
    mask = fluid.data(
        name="mask", shape=[bs, -1, 1],
        dtype="float32")  # [batch_size, seq_max, 1]
    label = fluid.data(
        name="label", shape=[bs, 1], dtype="int64")  # [batch_size, 1]

    datas = [items, seq_index, last_index, adj_in, adj_out, mask, label]
    py_reader = fluid.io.DataLoader.from_generator(
        capacity=256, feed_list=datas, iterable=False)
    feed_datas = datas

    items_emb = fluid.embedding(
        input=items,
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [batch_size, uniq_max, h]

    pre_state = items_emb
    for i in range(step):
        pre_state = layers.reshape(x=pre_state, shape=[bs, -1, hidden_size])
        state_in = layers.fc(
            input=pre_state,
            name="state_in",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]
        state_out = layers.fc(
            input=pre_state,
            name="state_out",
            size=hidden_size,
            act=None,
            num_flatten_dims=2,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-stdv, high=stdv)))  # [batch_size, uniq_max, h]

        state_adj_in = layers.matmul(adj_in, state_in)     # [batch_size, uniq_max, h]
        state_adj_out = layers.matmul(adj_out, state_out)  # [batch_size, uniq_max, h]

        gru_input = layers.concat([state_adj_in, state_adj_out], axis=2)
        gru_input = layers.reshape(x=gru_input, shape=[-1, hidden_size * 2])
        gru_fc = layers.fc(
            input=gru_input, name="gru_fc", size=3 * hidden_size,
            bias_attr=False)
        pre_state, _, _ = fluid.layers.gru_unit(
            input=gru_fc,
            hidden=layers.reshape(x=pre_state, shape=[-1, hidden_size]),
            size=3 * hidden_size)

    final_state = layers.reshape(pre_state, shape=[bs, -1, hidden_size])
    seq = layers.gather_nd(final_state, seq_index)
    last = layers.gather_nd(final_state, last_index)

    seq_fc = layers.fc(
        input=seq,
        name="seq_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, h]
    last_fc = layers.fc(
        input=last,
        name="last_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        num_flatten_dims=1,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    seq_fc_t = layers.transpose(
        seq_fc, perm=[1, 0, 2])  # [seq_max, batch_size, h]
    add = layers.elementwise_add(seq_fc_t, last_fc)  # [seq_max, batch_size, h]
    b = layers.create_parameter(
        shape=[hidden_size],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(value=0.0))  # [h]
    add = layers.elementwise_add(add, b)  # [seq_max, batch_size, h]

    add_sigmoid = layers.sigmoid(add)  # [seq_max, batch_size, h]
    add_sigmoid = layers.transpose(
        add_sigmoid, perm=[1, 0, 2])  # [batch_size, seq_max, h]

    weight = layers.fc(
        input=add_sigmoid,
        name="weight_fc",
        size=1,
        act=None,
        num_flatten_dims=2,
        bias_attr=False,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, seq_max, 1]
    weight *= mask
    weight_mask = layers.elementwise_mul(
        seq, weight, axis=0)  # [batch_size, seq_max, h]
    global_attention = layers.reduce_sum(weight_mask, dim=1)  # [batch_size, h]

    final_attention = layers.concat(
        [global_attention, last], axis=1)  # [batch_size, 2*h]
    final_attention_fc = layers.fc(
        input=final_attention,
        name="final_attention_fc",
        size=hidden_size,
        bias_attr=False,
        act=None,
        param_attr=fluid.ParamAttr(
            initializer=fluid.initializer.Uniform(
                low=-stdv, high=stdv)))  # [batch_size, h]

    all_vocab = layers.create_global_var(
        shape=[items_num - 1],
        value=0,
        dtype="int64",
        persistable=True,
        name="all_vocab")
    all_emb = fluid.embedding(
        input=all_vocab,
        param_attr=fluid.ParamAttr(
            name="emb",
            initializer=fluid.initializer.Uniform(low=-stdv, high=stdv)),
        size=[items_num, hidden_size])  # [all_vocab, h]

    logits = layers.matmul(
        x=final_attention_fc, y=all_emb,
        transpose_y=True)  # [batch_size, all_vocab]
    softmax = layers.softmax_with_cross_entropy(
        logits=logits, label=label)  # [batch_size, 1]
    loss = layers.reduce_mean(softmax)  # [1]
    acc = layers.accuracy(input=logits, label=label, k=50)
    return loss, acc, py_reader, feed_datas, logits
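# Hedged numpy sketch of the soft attention pooling above (toy shapes): the
# per-position scalar weights are masked to the valid positions, scale the
# sequence states, and a sum over the time axis gives the session vector.
import numpy as np

seq = np.random.rand(2, 3, 4).astype("float32")     # [batch, seq_max, h]
weight = np.random.rand(2, 3, 1).astype("float32")  # [batch, seq_max, 1]
mask = np.array([[[1], [1], [0]], [[1], [0], [0]]], dtype="float32")

weight = weight * mask
global_attention = (seq * weight).sum(axis=1)       # [batch, h]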
def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None):
    weight_1_arr = []
    weight_2_arr = []
    bias_arr = []
    hidden_array = []
    cell_array = []
    mask_array = []

    for i in range(num_layers):
        weight_1 = layers.create_parameter(
            [hidden_size * 2, hidden_size * 4],
            dtype="float32",
            name="fc_weight1_" + str(i),
            default_initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale))
        weight_1_arr.append(weight_1)
        bias_1 = layers.create_parameter(
            [hidden_size * 4],
            dtype="float32",
            name="fc_bias1_" + str(i),
            default_initializer=fluid.initializer.Constant(0.0))
        bias_arr.append(bias_1)

        pre_hidden = layers.slice(
            init_hidden, axes=[0], starts=[i], ends=[i + 1])
        pre_cell = layers.slice(
            init_cell, axes=[0], starts=[i], ends=[i + 1])
        pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size])
        pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size])
        hidden_array.append(pre_hidden)
        cell_array.append(pre_cell)

    input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2])
    rnn = PaddingRNN()
    with rnn.step():
        input = rnn.step_input(input_embedding)
        for k in range(num_layers):
            pre_hidden = rnn.memory(init=hidden_array[k])
            pre_cell = rnn.memory(init=cell_array[k])
            weight_1 = weight_1_arr[k]
            bias = bias_arr[k]

            nn = layers.concat([input, pre_hidden], 1)
            gate_input = layers.matmul(x=nn, y=weight_1)
            gate_input = layers.elementwise_add(gate_input, bias)
            i = layers.slice(
                gate_input, axes=[1], starts=[0], ends=[hidden_size])
            j = layers.slice(
                gate_input,
                axes=[1],
                starts=[hidden_size],
                ends=[hidden_size * 2])
            f = layers.slice(
                gate_input,
                axes=[1],
                starts=[hidden_size * 2],
                ends=[hidden_size * 3])
            o = layers.slice(
                gate_input,
                axes=[1],
                starts=[hidden_size * 3],
                ends=[hidden_size * 4])

            c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
                i) * layers.tanh(j)
            m = layers.tanh(c) * layers.sigmoid(o)

            rnn.update_memory(pre_hidden, m)
            rnn.update_memory(pre_cell, c)
            rnn.step_output(m)
            rnn.step_output(c)

            input = m
            if dropout is not None and dropout > 0.0:
                input = layers.dropout(
                    input,
                    dropout_prob=dropout,
                    dropout_implementation='upscale_in_train')
        rnn.step_output(input)

    rnnout = rnn()

    last_hidden_array = []
    last_cell_array = []
    real_res = rnnout[-1]
    for i in range(num_layers):
        m = rnnout[i * 2]
        c = rnnout[i * 2 + 1]
        m.stop_gradient = True
        c.stop_gradient = True
        last_h = layers.slice(
            m, axes=[0], starts=[num_steps - 1], ends=[num_steps])
        last_hidden_array.append(last_h)
        last_c = layers.slice(
            c, axes=[0], starts=[num_steps - 1], ends=[num_steps])
        last_cell_array.append(last_c)

    real_res = layers.transpose(x=real_res, perm=[1, 0, 2])
    last_hidden = layers.concat(last_hidden_array, 0)
    last_cell = layers.concat(last_cell_array, 0)
    return real_res, last_hidden, last_cell
def encoder(x,
            y,
            vocab_size,
            emb_size,
            init_hidden=None,
            init_cell=None,
            para_name='',
            custom_samples=None,
            custom_probabilities=None,
            test_mode=False,
            args=None):
    x_emb = layers.embedding(
        input=x,
        size=[vocab_size, emb_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(name='embedding_para'))
    rnn_input = x_emb
    rnn_outs = []
    rnn_outs_ori = []
    cells = []
    projs = []

    for i in range(args.num_layers):
        rnn_input = dropout(rnn_input, test_mode, args)
        if init_hidden and init_cell:
            h0 = layers.squeeze(
                layers.slice(
                    init_hidden, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
            c0 = layers.squeeze(
                layers.slice(
                    init_cell, axes=[0], starts=[i], ends=[i + 1]),
                axes=[0])
        else:
            h0 = c0 = None
        rnn_out, cell, input_proj = lstmp_encoder(
            rnn_input, args.hidden_size, h0, c0,
            para_name + 'layer{}'.format(i + 1), emb_size, test_mode, args)
        rnn_out_ori = rnn_out
        if i > 0:
            rnn_out = rnn_out + rnn_input
        rnn_out = dropout(rnn_out, test_mode, args)
        cell = dropout(cell, test_mode, args)
        rnn_outs.append(rnn_out)
        rnn_outs_ori.append(rnn_out_ori)
        rnn_input = rnn_out
        cells.append(cell)
        projs.append(input_proj)

    softmax_weight = layers.create_parameter(
        [vocab_size, emb_size], dtype="float32", name="softmax_weight")
    softmax_bias = layers.create_parameter(
        [vocab_size], dtype="float32", name='softmax_bias')
    projection = layers.matmul(rnn_outs[-1], softmax_weight, transpose_y=True)
    projection = layers.elementwise_add(projection, softmax_bias)
    projection = layers.reshape(projection, shape=[-1, vocab_size])

    if args.sample_softmax and (not test_mode):
        loss = layers.sampled_softmax_with_cross_entropy(
            logits=projection,
            label=y,
            num_samples=args.n_negative_samples_batch,
            seed=args.random_seed)
    else:
        label = layers.one_hot(input=y, depth=vocab_size)
        loss = layers.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=True)
    return [x_emb, projection, loss], rnn_outs, rnn_outs_ori, cells, projs
def gat(gw,
        feature,
        hidden_size,
        activation,
        name,
        num_heads=8,
        feat_drop=0.6,
        attn_drop=0.6,
        is_test=False):
    """Implementation of graph attention networks (GAT)

    This is an implementation of the paper GRAPH ATTENTION NETWORKS
    (https://arxiv.org/abs/1710.10903).

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or
            :code:`GraphWrapper`)
        feature: A tensor with shape (num_nodes, feature_size).
        hidden_size: The hidden size for gat.
        activation: The activation for the output.
        name: Gat layer names.
        num_heads: The head number in gat.
        feat_drop: Dropout rate for feature.
        attn_drop: Dropout rate for attention.
        is_test: Whether in test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads)
    """

    def send_attention(src_feat, dst_feat, edge_feat):
        output = src_feat["left_a"] + dst_feat["right_a"]
        output = L.leaky_relu(output, alpha=0.2)  # (num_edges, num_heads)
        return {"alpha": output, "h": src_feat["h"]}

    def reduce_attention(msg):
        alpha = msg["alpha"]  # lod-tensor (batch_size, seq_len, num_heads)
        h = msg["h"]
        alpha = paddle_helper.sequence_softmax(alpha)
        old_h = h
        h = L.reshape(h, [-1, num_heads, hidden_size])
        alpha = L.reshape(alpha, [-1, num_heads, 1])
        if attn_drop > 1e-15:
            alpha = L.dropout(
                alpha,
                dropout_prob=attn_drop,
                is_test=is_test,
                dropout_implementation="upscale_in_train")
        h = h * alpha
        h = L.reshape(h, [-1, num_heads * hidden_size])
        h = L.lod_reset(h, old_h)
        return L.sequence_pool(h, "sum")

    if feat_drop > 1e-15:
        feature = L.dropout(
            feature,
            dropout_prob=feat_drop,
            is_test=is_test,
            dropout_implementation='upscale_in_train')

    ft = L.fc(
        feature,
        hidden_size * num_heads,
        bias_attr=False,
        param_attr=fluid.ParamAttr(name=name + '_weight'))
    left_a = L.create_parameter(
        shape=[num_heads, hidden_size],
        dtype='float32',
        name=name + '_gat_l_A')
    right_a = L.create_parameter(
        shape=[num_heads, hidden_size],
        dtype='float32',
        name=name + '_gat_r_A')
    reshape_ft = L.reshape(ft, [-1, num_heads, hidden_size])
    left_a_value = L.reduce_sum(reshape_ft * left_a, -1)
    right_a_value = L.reduce_sum(reshape_ft * right_a, -1)

    msg = gw.send(
        send_attention,
        nfeat_list=[("h", ft), ("left_a", left_a_value),
                    ("right_a", right_a_value)])
    output = gw.recv(msg, reduce_attention)

    bias = L.create_parameter(
        shape=[hidden_size * num_heads],
        dtype='float32',
        is_bias=True,
        name=name + '_bias')
    bias.stop_gradient = True
    output = L.elementwise_add(output, bias, act=activation)
    return output
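# Hedged numpy sketch of the attention-score decomposition used in gat above:
# the GAT score a^T [W h_i || W h_j] splits into a per-source and a
# per-destination dot product, so left_a_value / right_a_value can be computed
# once per node and merely added per edge before the LeakyReLU.
import numpy as np

hidden = 4
a_left = np.random.rand(hidden).astype("float32")
a_right = np.random.rand(hidden).astype("float32")
wh_src = np.random.rand(hidden).astype("float32")
wh_dst = np.random.rand(hidden).astype("float32")

score = np.dot(np.concatenate([a_left, a_right]),
               np.concatenate([wh_src, wh_dst]))
assert np.isclose(score, wh_src.dot(a_left) + wh_dst.dot(a_right), atol=1e-5)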
def gin(gw,
        feature,
        hidden_size,
        activation,
        name,
        init_eps=0.0,
        train_eps=False):
    r"""Implementation of Graph Isomorphism Network (GIN) layer.

    This is an implementation of the paper How Powerful are Graph Neural
    Networks? (https://arxiv.org/pdf/1810.00826.pdf). In their implementation,
    all MLPs have 2 layers. Batch normalization is applied on every hidden
    layer.

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or
            :code:`GraphWrapper`)
        feature: A tensor with shape (num_nodes, feature_size).
        name: GIN layer names.
        hidden_size: The hidden size for gin.
        activation: The activation for the output.
        init_eps: float, optional. Initial :math:`\epsilon` value, default
            is 0.
        train_eps: bool, optional. If True, :math:`\epsilon` will be a
            learnable parameter.

    Return:
        A tensor with shape (num_nodes, hidden_size).
    """

    def send_src_copy(src_feat, dst_feat, edge_feat):
        return src_feat["h"]

    epsilon = L.create_parameter(
        shape=[1, 1],
        dtype="float32",
        attr=fluid.ParamAttr(name="%s_eps" % name),
        default_initializer=fluid.initializer.ConstantInitializer(
            value=init_eps))
    if not train_eps:
        epsilon.stop_gradient = True

    msg = gw.send(send_src_copy, nfeat_list=[("h", feature)])
    output = gw.recv(msg, "sum") + feature * (epsilon + 1.0)

    output = L.fc(
        output,
        size=hidden_size,
        act=None,
        param_attr=fluid.ParamAttr(name="%s_w_0" % name),
        bias_attr=fluid.ParamAttr(name="%s_b_0" % name))
    output = L.layer_norm(
        output,
        begin_norm_axis=1,
        param_attr=fluid.ParamAttr(
            name="norm_scale_%s" % (name),
            initializer=fluid.initializer.Constant(1.0)),
        bias_attr=fluid.ParamAttr(
            name="norm_bias_%s" % (name),
            initializer=fluid.initializer.Constant(0.0)),
    )
    if activation is not None:
        output = getattr(L, activation)(output)

    output = L.fc(
        output,
        size=hidden_size,
        act=activation,
        param_attr=fluid.ParamAttr(name="%s_w_1" % name),
        bias_attr=fluid.ParamAttr(name="%s_b_1" % name))
    return output
def __init__(self, num_features, eps=1e-5):
    super(ILN, self).__init__()
    self.eps = eps
    self.rho = layers.create_parameter(
        shape=[1, num_features, 1, 1],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(0.0))
    self.gamma = layers.create_parameter(
        shape=[1, num_features, 1, 1],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(1.0))
    self.beta = layers.create_parameter(
        shape=[1, num_features, 1, 1],
        dtype='float32',
        default_initializer=fluid.initializer.Constant(0.0))
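# Hedged numpy sketch of the forward pass this __init__ parameterizes
# (assuming the U-GAT-IT-style ILN formulation): rho blends instance-norm and
# layer-norm statistics, then gamma and beta rescale the result. Scalar
# parameter values below are illustrative.
import numpy as np

x = np.random.rand(2, 3, 4, 4).astype("float32")  # NCHW activations
eps, rho, gamma, beta = 1e-5, 0.5, 1.0, 0.0

in_mean = x.mean(axis=(2, 3), keepdims=True)      # instance norm: per (N, C)
in_var = x.var(axis=(2, 3), keepdims=True)
ln_mean = x.mean(axis=(1, 2, 3), keepdims=True)   # layer norm: per N
ln_var = x.var(axis=(1, 2, 3), keepdims=True)

out_in = (x - in_mean) / np.sqrt(in_var + eps)
out_ln = (x - ln_mean) / np.sqrt(ln_var + eps)
out = gamma * (rho * out_in + (1 - rho) * out_ln) + beta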
def forward(self): """Build the GATNE net. """ param_attr_init = fluid.initializer.Uniform( low=-1.0, high=1.0, seed=np.random.randint(100)) embed_param_attrs = fluid.ParamAttr(name='Base_node_embed', initializer=param_attr_init) # node_embeddings base_node_embed = fl.embedding( input=fl.reshape(self.train_inputs, shape=[-1, 1]), size=[self.num_nodes, self.embedding_size], param_attr=embed_param_attrs) node_features = [] for edge_type in self.edge_types: param_attr_init = fluid.initializer.Uniform( low=-1.0, high=1.0, seed=np.random.randint(100)) embed_param_attrs = fluid.ParamAttr(name='%s_node_embed' % edge_type, initializer=param_attr_init) features = fl.embedding( input=self.gw[edge_type].node_feat['index'], size=[self.num_nodes, self.embedding_u_size], param_attr=embed_param_attrs) node_features.append(features) # mp_output: list of embedding(self.num_nodes, dim) mp_output = self.message_passing(self.gw, self.edge_types, node_features) # U : (num_type[m], num_nodes, dim[s]) node_type_embed = fl.stack(mp_output, axis=0) # U : (num_nodes, num_type[m], dim[s]) node_type_embed = fl.transpose(node_type_embed, perm=[1, 0, 2]) # gather node_type_embed from train_inputs node_type_embed = fl.gather(node_type_embed, self.train_inputs) # M_r trans_weights = fl.create_parameter( shape=[ self.edge_type_count, self.embedding_u_size, self.embedding_size // self.att_head ], attr=fluid.ParamAttr( name='trans_w', initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))), dtype='float32') # W_r trans_weights_s1 = fl.create_parameter( shape=[self.edge_type_count, self.embedding_u_size, self.dim_a], attr=fluid.ParamAttr( name='trans_w_s1', initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))), dtype='float32') # w_r trans_weights_s2 = fl.create_parameter( shape=[self.edge_type_count, self.dim_a, self.att_head], attr=fluid.ParamAttr( name='trans_w_s2', initializer=fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size))), dtype='float32') trans_w = fl.gather(trans_weights, self.train_types) trans_w_s1 = fl.gather(trans_weights_s1, self.train_types) trans_w_s2 = fl.gather(trans_weights_s2, self.train_types) attention = self.attention(node_type_embed, trans_w_s1, trans_w_s2) node_type_embed = fl.matmul(attention, node_type_embed) node_embed = base_node_embed + fl.reshape( fl.matmul(node_type_embed, trans_w), [-1, self.embedding_size]) self.last_node_embed = fl.l2_normalize(node_embed, axis=1) nce_weight_initializer = fluid.initializer.TruncatedNormalInitializer( loc=0.0, scale=1.0 / math.sqrt(self.embedding_size)) nce_weight_attrs = fluid.ParamAttr(name='nce_weight', initializer=nce_weight_initializer) weight_pos = fl.embedding(input=self.train_labels, size=[self.num_nodes, self.embedding_size], param_attr=nce_weight_attrs) weight_neg = fl.embedding(input=self.train_negs, size=[self.num_nodes, self.embedding_size], param_attr=nce_weight_attrs) tmp_node_embed = fl.unsqueeze(self.last_node_embed, axes=[1]) pos_logits = fl.matmul(tmp_node_embed, weight_pos, transpose_y=True) # [B, 1, 1] neg_logits = fl.matmul(tmp_node_embed, weight_neg, transpose_y=True) # [B, 1, neg_num] pos_score = fl.squeeze(pos_logits, axes=[1]) pos_score = fl.clip(pos_score, min=-10, max=10) pos_score = -1.0 * fl.logsigmoid(pos_score) neg_score = fl.squeeze(neg_logits, axes=[1]) neg_score = fl.clip(neg_score, min=-10, max=10) neg_score = -1.0 * fl.logsigmoid(-1.0 * neg_score) neg_score = fl.reduce_sum(neg_score, dim=1, keep_dim=True) self.loss = fl.reduce_mean(pos_score + neg_score)
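# The GATNE forward pass above calls self.attention, which is not shown in this
# snippet. A minimal sketch, assuming the usual GATNE-style self-attention over
# edge types, a = softmax(tanh(U * W_s1) * w_s2); shapes follow the comments
# above, and this is an assumption about the helper, not the original code.
def attention(self, node_type_embed, trans_w_s1, trans_w_s2):
    # node_type_embed: [B, num_type, dim_s], trans_w_s1: [B, dim_s, dim_a],
    # trans_w_s2: [B, dim_a, att_head]
    attention = fl.matmul(node_type_embed, trans_w_s1)   # [B, num_type, dim_a]
    attention = fl.tanh(attention)
    attention = fl.matmul(attention, trans_w_s2)         # [B, num_type, att_head]
    attention = fl.transpose(attention, perm=[0, 2, 1])  # [B, att_head, num_type]
    attention = fl.softmax(attention)                    # normalize over edge types
    return attention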
def lm_model(hidden_size, vocab_size, batch_size, num_layers=2, num_steps=20, init_scale=0.1, dropout=None, rnn_model='static', use_py_reader=False): def padding_rnn(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice( init_hidden, axes=[0], starts=[i], ends=[i + 1]) pre_cell = layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape(pre_hidden, shape=[-1, hidden_size]) pre_cell = layers.reshape(pre_cell, shape=[-1, hidden_size]) hidden_array.append(pre_hidden) cell_array.append(pre_cell) input_embedding = layers.transpose(input_embedding, perm=[1, 0, 2]) rnn = PaddingRNN() with rnn.step(): input = rnn.step_input(input_embedding) for k in range(num_layers): pre_hidden = rnn.memory(init=hidden_array[k]) pre_cell = rnn.memory(init=cell_array[k]) weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i = layers.slice( gate_input, axes=[1], starts=[0], ends=[hidden_size]) j = layers.slice( gate_input, axes=[1], starts=[hidden_size], ends=[hidden_size * 2]) f = layers.slice( gate_input, axes=[1], starts=[hidden_size * 2], ends=[hidden_size * 3]) o = layers.slice( gate_input, axes=[1], starts=[hidden_size * 3], ends=[hidden_size * 4]) c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) rnn.update_memory(pre_hidden, m) rnn.update_memory(pre_cell, c) rnn.step_output(m) rnn.step_output(c) input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') rnn.step_output(input) rnnout = rnn() last_hidden_array = [] last_cell_array = [] real_res = rnnout[-1] for i in range(num_layers): m = rnnout[i * 2] c = rnnout[i * 2 + 1] m.stop_gradient = True c.stop_gradient = True last_h = layers.slice( m, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_hidden_array.append(last_h) last_c = layers.slice( c, axes=[0], starts=[num_steps - 1], ends=[num_steps]) last_cell_array.append(last_c) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) last_hidden = layers.concat(last_hidden_array, 0) last_cell = layers.concat(last_cell_array, 0) return real_res, last_hidden, last_cell def encoder_static(input_embedding, len=3, init_hidden=None, init_cell=None): weight_1_arr = [] weight_2_arr = [] bias_arr = [] hidden_array = [] cell_array = [] mask_array = [] for i in range(num_layers): weight_1 = layers.create_parameter( [hidden_size * 2, hidden_size * 4], dtype="float32", name="fc_weight1_" + str(i), default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) weight_1_arr.append(weight_1) bias_1 = layers.create_parameter( [hidden_size * 4], dtype="float32", name="fc_bias1_" + str(i), default_initializer=fluid.initializer.Constant(0.0)) bias_arr.append(bias_1) pre_hidden = layers.slice( init_hidden, axes=[0], 
starts=[i], ends=[i + 1]) pre_cell = layers.slice( init_cell, axes=[0], starts=[i], ends=[i + 1]) pre_hidden = layers.reshape( pre_hidden, shape=[-1, hidden_size], inplace=True) pre_cell = layers.reshape( pre_cell, shape=[-1, hidden_size], inplace=True) hidden_array.append(pre_hidden) cell_array.append(pre_cell) res = [] sliced_inputs = layers.split( input_embedding, num_or_sections=len, dim=1) for index in range(len): input = sliced_inputs[index] input = layers.reshape(input, shape=[-1, hidden_size], inplace=True) for k in range(num_layers): pre_hidden = hidden_array[k] pre_cell = cell_array[k] weight_1 = weight_1_arr[k] bias = bias_arr[k] nn = layers.concat([input, pre_hidden], 1) gate_input = layers.matmul(x=nn, y=weight_1) gate_input = layers.elementwise_add(gate_input, bias) i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1) try: from paddle.fluid.contrib.layers import fused_elemwise_activation # fluid.contrib.layers.fused_elemwise_activation can do a fused # operation, like: # 1) x + sigmoid(y); x + tanh(y) # 2) tanh(x + y) # Now the unary operations supported in this fused op are limited, # and we will extend it to support more unary operations and do this # kind of fusion automatically in future versions of paddle.fluid. # layers.sigmoid(i) * layers.tanh(j) tmp0 = fused_elemwise_activation( x=layers.tanh(j), y=i, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) # pre_cell * layers.sigmoid(f) tmp1 = fused_elemwise_activation( x=pre_cell, y=f, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) c = tmp0 + tmp1 # layers.tanh(c) * layers.sigmoid(o) m = fused_elemwise_activation( x=layers.tanh(c), y=o, functor_list=['elementwise_mul', 'sigmoid'], save_intermediate_out=False) except ImportError: c = pre_cell * layers.sigmoid(f) + layers.sigmoid( i) * layers.tanh(j) m = layers.tanh(c) * layers.sigmoid(o) hidden_array[k] = m cell_array[k] = c input = m if dropout != None and dropout > 0.0: input = layers.dropout( input, dropout_prob=dropout, dropout_implementation='upscale_in_train') res.append(input) last_hidden = layers.concat(hidden_array, 1) last_hidden = layers.reshape( last_hidden, shape=[-1, num_layers, hidden_size], inplace=True) last_hidden = layers.transpose(x=last_hidden, perm=[1, 0, 2]) last_cell = layers.concat(cell_array, 1) last_cell = layers.reshape( last_cell, shape=[-1, num_layers, hidden_size]) last_cell = layers.transpose(x=last_cell, perm=[1, 0, 2]) real_res = layers.concat(res, 0) real_res = layers.reshape( real_res, shape=[len, -1, hidden_size], inplace=True) real_res = layers.transpose(x=real_res, perm=[1, 0, 2]) return real_res, last_hidden, last_cell batch_size_each = batch_size if use_py_reader: feed_shapes = [[batch_size_each, num_steps, 1], [batch_size_each * num_steps, 1]] py_reader = fluid.layers.py_reader( capacity=16, shapes=feed_shapes, dtypes=['int64', 'int64']) x, y = fluid.layers.read_file(py_reader) else: x = layers.data( name="x", shape=[batch_size_each, num_steps, 1], dtype='int64', append_batch_size=False) y = layers.data( name="y", shape=[batch_size_each * num_steps, 1], dtype='int64', append_batch_size=False) init_hidden = layers.data( name="init_hidden", shape=[num_layers, batch_size_each, hidden_size], dtype='float32', append_batch_size=False) init_cell = layers.data( name="init_cell", shape=[num_layers, batch_size_each, hidden_size], dtype='float32', append_batch_size=False) init_cell.persistable = True init_hidden.persistable = True init_hidden_reshape = layers.reshape(
init_hidden, shape=[num_layers, -1, hidden_size]) init_cell_reshape = layers.reshape( init_cell, shape=[num_layers, -1, hidden_size]) x_emb = layers.embedding( input=x, size=[vocab_size, hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale))) x_emb = layers.reshape( x_emb, shape=[-1, num_steps, hidden_size], inplace=True) if dropout != None and dropout > 0.0: x_emb = layers.dropout( x_emb, dropout_prob=dropout, dropout_implementation='upscale_in_train') if rnn_model == "padding": rnn_out, last_hidden, last_cell = padding_rnn( x_emb, len=num_steps, init_hidden=init_hidden_reshape, init_cell=init_cell_reshape) elif rnn_model == "static": rnn_out, last_hidden, last_cell = encoder_static( x_emb, len=num_steps, init_hidden=init_hidden_reshape, init_cell=init_cell_reshape) elif rnn_model == "cudnn": x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) rnn_out, last_hidden, last_cell = layers.lstm( x_emb, init_hidden_reshape, init_cell_reshape, num_steps, hidden_size, num_layers, is_bidirec=False, default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) rnn_out = layers.transpose(rnn_out, perm=[1, 0, 2]) elif rnn_model == "basic_lstm": rnn_out, last_hidden, last_cell = basic_lstm( x_emb, init_hidden, init_cell, hidden_size, \ num_layers=num_layers, batch_first=True, dropout_prob=dropout, \ param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-init_scale, high=init_scale) ), \ bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) ), \ forget_bias = 0.0) else: print("rnn_model type not supported:", rnn_model) return rnn_out = layers.reshape( rnn_out, shape=[-1, num_steps, hidden_size], inplace=True) softmax_weight = layers.create_parameter( [hidden_size, vocab_size], dtype="float32", name="softmax_weight", default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) softmax_bias = layers.create_parameter( [vocab_size], dtype="float32", name='softmax_bias', default_initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale)) projection = layers.matmul(rnn_out, softmax_weight) projection = layers.elementwise_add(projection, softmax_bias) projection = layers.reshape( projection, shape=[-1, vocab_size], inplace=True) loss = layers.softmax_with_cross_entropy( logits=projection, label=y, soft_label=False) loss = layers.reshape(loss, shape=[-1, num_steps], inplace=True) loss = layers.reduce_mean(loss, dim=[0]) loss = layers.reduce_sum(loss) loss.persistable = True last_cell.persistable = True last_hidden.persistable = True # This will feed last_hidden, last_cell to init_hidden, init_cell, which # can be used directly in next batch. This can avoid the fetching of # last_hidden and last_cell and feeding of init_hidden and init_cell in # each training step. layers.assign(input=last_cell, output=init_cell) layers.assign(input=last_hidden, output=init_hidden) feeding_list = ['x', 'y', 'init_hidden', 'init_cell'] if use_py_reader: return loss, last_hidden, last_cell, feeding_list, py_reader else: return loss, last_hidden, last_cell, feeding_list
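# A minimal driver for lm_model, showing how the returned loss and feeding_list
# are typically wired to an Executor. The sizes and the zero-filled batch are
# placeholders assumed for illustration only.
import numpy as np
import paddle.fluid as fluid

loss, last_hidden, last_cell, feeding_list = lm_model(
    hidden_size=200, vocab_size=10000, batch_size=12, rnn_model='static')
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
feed = {
    'x': np.zeros((12, 20, 1), dtype='int64'),               # [batch, num_steps, 1]
    'y': np.zeros((12 * 20, 1), dtype='int64'),
    'init_hidden': np.zeros((2, 12, 200), dtype='float32'),  # [layers, batch, hidden]
    'init_cell': np.zeros((2, 12, 200), dtype='float32'),
}
loss_v, = exe.run(fluid.default_main_program(), feed=feed, fetch_list=[loss])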
def _build_decoder(self, enc_last_hidden, enc_last_cell, mode='train', beam_size=10): softmax_weight = layers.create_parameter([self.hidden_size, self.tar_vocab_size], dtype="float32", name="softmax_weight", \ default_initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale)) if mode == 'train': dec_output, dec_last_hidden, dec_last_cell = basic_lstm( self.tar_emb, enc_last_hidden, enc_last_cell, \ self.hidden_size, num_layers=self.num_layers, \ batch_first=self.batch_first, \ dropout_prob=self.dropout, \ param_attr = ParamAttr( initializer=fluid.initializer.UniformInitializer(low=-self.init_scale, high=self.init_scale) ), \ bias_attr = ParamAttr( initializer = fluid.initializer.Constant(0.0) )) dec_output = layers.matmul(dec_output, softmax_weight) return dec_output elif mode == 'beam_search' or mode == 'greedy_search': dec_unit_list = [] name = 'basic_lstm' for i in range(self.num_layers): new_name = name + "_layers_" + str(i) dec_unit_list.append( BasicLSTMUnit(new_name, self.hidden_size, dtype='float32')) def decoder_step(current_in, pre_hidden_array, pre_cell_array): new_hidden_array = [] new_cell_array = [] step_in = current_in for i in range(self.num_layers): pre_hidden = pre_hidden_array[i] pre_cell = pre_cell_array[i] new_hidden, new_cell = dec_unit_list[i](step_in, pre_hidden, pre_cell) new_hidden_array.append(new_hidden) new_cell_array.append(new_cell) step_in = new_hidden return step_in, new_hidden_array, new_cell_array if mode == 'beam_search': max_src_seq_len = layers.shape(self.src)[1] max_length = max_src_seq_len * 2 #max_length = layers.fill_constant( [1], dtype='int32', value = 10) pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1) full_ids = layers.fill_constant([1, 1], dtype='int64', value=1) score = layers.fill_constant([1], dtype='float32', value=0.0) #eos_ids = layers.fill_constant( [1, 1], dtype='int64', value=2) pre_hidden_array = [] pre_cell_array = [] pre_feed = layers.fill_constant([beam_size, self.hidden_size], dtype='float32', value=0) for i in range(self.num_layers): pre_hidden_array.append( layers.expand(enc_last_hidden[i], [beam_size, 1])) pre_cell_array.append( layers.expand(enc_last_cell[i], [beam_size, 1])) eos_ids = layers.fill_constant([beam_size], dtype='int64', value=2) init_score = np.zeros((beam_size)).astype('float32') init_score[1:] = -INF pre_score = layers.assign(init_score) #pre_score = layers.fill_constant( [1,], dtype='float32', value= 0.0) tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1) enc_memory = layers.expand(self.enc_output, [beam_size, 1, 1]) pre_tokens = layers.fill_constant([beam_size, 1], dtype='int64', value=1) finished_seq = layers.fill_constant([beam_size, 1], dtype='int64', value=0) finished_scores = layers.fill_constant([beam_size], dtype='float32', value=-INF) finished_flag = layers.fill_constant([beam_size], dtype='float32', value=0.0) step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0) cond = layers.less_than(x=step_idx, y=max_length) # default force_cpu=True parent_idx = layers.fill_constant([1], dtype='int32', value=0) while_op = layers.While(cond) def compute_topk_scores_and_seq(sequences, scores, scores_to_gather, flags, beam_size, select_beam=None, generate_id=None): scores = layers.reshape(scores, shape=[1, -1]) _, topk_indexs = layers.topk(scores, k=beam_size) topk_indexs = layers.reshape(topk_indexs, shape=[-1]) # gather result top_seq = layers.gather(sequences, topk_indexs) topk_flags = layers.gather(flags, topk_indexs) 
topk_gather_scores = layers.gather(scores_to_gather, topk_indexs) if select_beam: topk_beam = layers.gather(select_beam, topk_indexs) else: topk_beam = select_beam if generate_id: topk_id = layers.gather(generate_id, topk_indexs) else: topk_id = generate_id return top_seq, topk_gather_scores, topk_flags, topk_beam, topk_id def grow_alive(curr_seq, curr_scores, curr_log_probs, curr_finished, select_beam, generate_id): curr_scores += curr_finished * -INF return compute_topk_scores_and_seq(curr_seq, curr_scores, curr_log_probs, curr_finished, beam_size, select_beam, generate_id=generate_id) def grow_finished(finished_seq, finished_scores, finished_flag, curr_seq, curr_scores, curr_finished): finished_seq = layers.concat([ finished_seq, layers.fill_constant( [beam_size, 1], dtype='int64', value=1) ], axis=1) curr_scores += (1.0 - curr_finished) * -INF #layers.Print( curr_scores, message="curr scores") curr_finished_seq = layers.concat([finished_seq, curr_seq], axis=0) curr_finished_scores = layers.concat( [finished_scores, curr_scores], axis=0) curr_finished_flags = layers.concat( [finished_flag, curr_finished], axis=0) return compute_topk_scores_and_seq(curr_finished_seq, curr_finished_scores, curr_finished_scores, curr_finished_flags, beam_size) def is_finished(alive_log_prob, finished_scores, finished_in_finished): max_out_len = 200 max_length_penalty = layers.pow( layers.fill_constant([1], dtype='float32', value=((5.0 + max_out_len) / 6.0)), alpha) lower_bound_alive_score = layers.slice( alive_log_prob, starts=[0], ends=[1], axes=[0]) / max_length_penalty lowest_score_of_finished_in_finished = finished_scores * finished_in_finished lowest_score_of_finished_in_finished += ( 1.0 - finished_in_finished) * -INF lowest_score_of_finished_in_finished = layers.reduce_min( lowest_score_of_finished_in_finished) met = layers.less_than( lower_bound_alive_score, lowest_score_of_finished_in_finished) met = layers.cast(met, 'float32') bound_is_met = layers.reduce_sum(met) finished_eos_num = layers.reduce_sum(finished_in_finished) finish_cond = layers.less_than( finished_eos_num, layers.fill_constant([1], dtype='float32', value=beam_size)) return finish_cond def grow_top_k(step_idx, alive_seq, alive_log_prob, parent_idx): pre_ids = alive_seq dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) current_log = layers.elementwise_add(x=layers.log(logits), y=alive_log_prob, axis=0) base_1 = layers.cast(step_idx, 'float32') + 6.0 base_1 /= 6.0 length_penalty = layers.pow(base_1, alpha) len_pen = layers.pow( ((5.
+ layers.cast(step_idx + 1, 'float32')) / 6.), alpha) current_log = layers.reshape(current_log, shape=[1, -1]) current_log = current_log / length_penalty topk_scores, topk_indices = layers.topk(input=current_log, k=beam_size) topk_scores = layers.reshape(topk_scores, shape=[-1]) topk_log_probs = topk_scores * length_penalty generate_id = layers.reshape( topk_indices, shape=[-1]) % self.tar_vocab_size selected_beam = layers.reshape( topk_indices, shape=[-1]) // self.tar_vocab_size topk_finished = layers.equal(generate_id, eos_ids) topk_finished = layers.cast(topk_finished, 'float32') generate_id = layers.reshape(generate_id, shape=[-1, 1]) pre_tokens_list = layers.gather(tokens, selected_beam) full_tokens_list = layers.concat( [pre_tokens_list, generate_id], axis=1) return full_tokens_list, topk_log_probs, topk_scores, topk_finished, selected_beam, generate_id, \ dec_att_out, new_hidden_array, new_cell_array with while_op.block(): topk_seq, topk_log_probs, topk_scores, topk_finished, topk_beam, topk_generate_id, attention_out, new_hidden_array, new_cell_array = \ grow_top_k( step_idx, pre_tokens, pre_score, parent_idx) alive_seq, alive_log_prob, _, alive_beam, alive_id = grow_alive( topk_seq, topk_scores, topk_log_probs, topk_finished, topk_beam, topk_generate_id) finished_seq_2, finished_scores_2, finished_flags_2, _, _ = grow_finished( finished_seq, finished_scores, finished_flag, topk_seq, topk_scores, topk_finished) finished_cond = is_finished(alive_log_prob, finished_scores_2, finished_flags_2) layers.increment(x=step_idx, value=1.0, in_place=True) layers.assign(alive_beam, parent_idx) layers.assign(alive_id, pre_tokens) layers.assign(alive_log_prob, pre_score) layers.assign(alive_seq, tokens) layers.assign(finished_seq_2, finished_seq) layers.assign(finished_scores_2, finished_scores) layers.assign(finished_flags_2, finished_flag) # update init_hidden, init_cell, input_feed new_feed = layers.gather(attention_out, parent_idx) layers.assign(new_feed, pre_feed) for i in range(self.num_layers): new_hidden_var = layers.gather(new_hidden_array[i], parent_idx) layers.assign(new_hidden_var, pre_hidden_array[i]) new_cell_var = layers.gather(new_cell_array[i], parent_idx) layers.assign(new_cell_var, pre_cell_array[i]) length_cond = layers.less_than(x=step_idx, y=max_length) layers.logical_and(x=length_cond, y=finished_cond, out=cond) tokens_with_eos = tokens all_seq = layers.concat([tokens_with_eos, finished_seq], axis=0) all_score = layers.concat([pre_score, finished_scores], axis=0) _, topk_index = layers.topk(all_score, k=beam_size) topk_index = layers.reshape(topk_index, shape=[-1]) final_seq = layers.gather(all_seq, topk_index) final_score = layers.gather(all_score, topk_index) return final_seq elif mode == 'greedy_search': max_src_seq_len = layers.shape(self.src)[1] max_length = max_src_seq_len * 2 #max_length = layers.fill_constant( [1], dtype='int32', value = 10) pre_ids = layers.fill_constant([1, 1], dtype='int64', value=1) full_ids = layers.fill_constant([1, 1], dtype='int64', value=1) score = layers.fill_constant([1], dtype='float32', value=0.0) eos_ids = layers.fill_constant([1, 1], dtype='int64', value=2) pre_hidden_array = [] pre_cell_array = [] pre_feed = layers.fill_constant([1, self.hidden_size], dtype='float32', value=0) for i in range(self.num_layers): pre_hidden_array.append(enc_last_hidden[i]) pre_cell_array.append(enc_last_cell[i]) #pre_hidden_array.append( layers.fill_constant( [1, hidden_size], dtype='float32', value=0) ) #pre_cell_array.append( layers.fill_constant( [1, 
hidden_size], dtype='float32', value=0) ) step_idx = layers.fill_constant(shape=[1], dtype='int32', value=0) cond = layers.less_than(x=step_idx, y=max_length) # default force_cpu=True while_op = layers.While(cond) with while_op.block(): dec_step_emb = layers.embedding( input=pre_ids, size=[self.tar_vocab_size, self.hidden_size], dtype='float32', is_sparse=False, param_attr=fluid.ParamAttr( name='target_embedding', initializer=fluid.initializer.UniformInitializer( low=-self.init_scale, high=self.init_scale))) dec_att_out, new_hidden_array, new_cell_array = decoder_step( dec_step_emb, pre_hidden_array, pre_cell_array) projection = layers.matmul(dec_att_out, softmax_weight) logits = layers.softmax(projection) logits = layers.log(logits) current_log = layers.elementwise_add(logits, score, axis=0) topk_score, topk_indices = layers.topk(input=current_log, k=1) new_ids = layers.concat([full_ids, topk_indices]) layers.assign(new_ids, full_ids) #layers.Print( full_ids, message="full ids") layers.assign(topk_score, score) layers.assign(topk_indices, pre_ids) layers.assign(dec_att_out, pre_feed) for i in range(self.num_layers): layers.assign(new_hidden_array[i], pre_hidden_array[i]) layers.assign(new_cell_array[i], pre_cell_array[i]) layers.increment(x=step_idx, value=1.0, in_place=True) eos_met = layers.not_equal(topk_indices, eos_ids) length_cond = layers.less_than(x=step_idx, y=max_length) layers.logical_and(x=length_cond, y=eos_met, out=cond) return full_ids else: print("mode not supported", mode)
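# The decoder above uses the names INF and alpha without defining them; in the
# source they are module-level constants (alpha is the GNMT-style length-penalty
# exponent). Plausible definitions, assumed here for completeness:
INF = 1.0 * 1e5
alpha = 0.6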
def temporal_conv_layer(self, x, Kt, c_in, c_out, name, act_func='relu'): """Temporal convolution layer""" _, T, n, _ = x.shape if c_in > c_out: x_input = fl.conv2d(input=x, num_filters=c_out, filter_size=[1, 1], stride=[1, 1], padding="SAME", data_format="NHWC", param_attr=fluid.ParamAttr(name="%s_conv2d_1" % name)) elif c_in < c_out: # if the input channel size is smaller than the output, # pad x up to the output channel size. pad = fl.fill_constant_batch_size_like( input=x, shape=[-1, T, n, c_out - c_in], dtype="float32", value=0.0) x_input = fl.concat([x, pad], axis=3) else: x_input = x # x_input = x_input[:, Kt - 1:T, :, :] if act_func == 'GLU': # gated linear unit bt_init = fluid.initializer.ConstantInitializer(value=0.0) bt = fl.create_parameter( shape=[2 * c_out], dtype="float32", attr=fluid.ParamAttr(name="%s_bt" % name, trainable=True, initializer=bt_init), ) x_conv = fl.conv2d(input=x, num_filters=2 * c_out, filter_size=[Kt, 1], stride=[1, 1], padding="SAME", data_format="NHWC", param_attr=fluid.ParamAttr(name="%s_conv2d_wt" % name)) x_conv = x_conv + bt return (x_conv[:, :, :, 0:c_out] + x_input) * fl.sigmoid( x_conv[:, :, :, -c_out:]) else: bt_init = fluid.initializer.ConstantInitializer(value=0.0) bt = fl.create_parameter( shape=[c_out], dtype="float32", attr=fluid.ParamAttr(name="%s_bt" % name, trainable=True, initializer=bt_init), ) x_conv = fl.conv2d(input=x, num_filters=c_out, filter_size=[Kt, 1], stride=[1, 1], padding="SAME", data_format="NHWC", param_attr=fluid.ParamAttr(name="%s_conv2d_wt" % name)) x_conv = x_conv + bt if act_func == "linear": return x_conv elif act_func == "sigmoid": return fl.sigmoid(x_conv) elif act_func == "relu": return fl.relu(x_conv + x_input) else: raise ValueError( f'ERROR: activation function "{act_func}" is not defined.')
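# A hedged usage sketch for temporal_conv_layer: an NHWC input of shape
# [batch, T, n_nodes, c_in] run through a GLU temporal convolution. The
# instance name `model` and all sizes are assumptions for illustration.
x = fl.data(name='stgcn_input', shape=[12, 207, 32], dtype='float32')  # -> [-1, 12, 207, 32]
out = model.temporal_conv_layer(x, Kt=3, c_in=32, c_out=64, name='tconv1', act_func='GLU')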
def net(self, inputs, is_infer=False): if is_infer: bs = self.evaluate_batch_size else: bs = self.train_batch_size stdv = 1.0 / math.sqrt(self.hidden_size) def embedding_layer(input, table_name, emb_dim, initializer_instance=None): emb = fluid.embedding( input=input, size=[self.dict_size, emb_dim], param_attr=fluid.ParamAttr( name=table_name, initializer=initializer_instance)) return emb sparse_initializer = fluid.initializer.Uniform(low=-stdv, high=stdv) items_emb = embedding_layer(inputs[0], "emb", self.hidden_size, sparse_initializer) pre_state = items_emb for i in range(self.step): pre_state = layers.reshape( x=pre_state, shape=[bs, -1, self.hidden_size]) state_in = layers.fc( input=pre_state, name="state_in", size=self.hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_out = layers.fc( input=pre_state, name="state_out", size=self.hidden_size, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, uniq_max, h] state_adj_in = layers.matmul(inputs[3], state_in) # [batch_size, uniq_max, h] state_adj_out = layers.matmul( inputs[4], state_out) # [batch_size, uniq_max, h] gru_input = layers.concat([state_adj_in, state_adj_out], axis=2) gru_input = layers.reshape( x=gru_input, shape=[-1, self.hidden_size * 2]) gru_fc = layers.fc(input=gru_input, name="gru_fc", size=3 * self.hidden_size, bias_attr=False) pre_state, _, _ = fluid.layers.gru_unit( input=gru_fc, hidden=layers.reshape( x=pre_state, shape=[-1, self.hidden_size]), size=3 * self.hidden_size) final_state = layers.reshape( pre_state, shape=[bs, -1, self.hidden_size]) seq = layers.gather_nd(final_state, inputs[1]) last = layers.gather_nd(final_state, inputs[2]) seq_fc = layers.fc( input=seq, name="seq_fc", size=self.hidden_size, bias_attr=False, act=None, num_flatten_dims=2, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, h] last_fc = layers.fc(input=last, name="last_fc", size=self.hidden_size, bias_attr=False, act=None, num_flatten_dims=1, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, h] seq_fc_t = layers.transpose( seq_fc, perm=[1, 0, 2]) # [seq_max, batch_size, h] add = layers.elementwise_add(seq_fc_t, last_fc) # [seq_max, batch_size, h] b = layers.create_parameter( shape=[self.hidden_size], dtype='float32', default_initializer=fluid.initializer.Constant(value=0.0)) # [h] add = layers.elementwise_add(add, b) # [seq_max, batch_size, h] add_sigmoid = layers.sigmoid(add) # [seq_max, batch_size, h] add_sigmoid = layers.transpose( add_sigmoid, perm=[1, 0, 2]) # [batch_size, seq_max, h] weight = layers.fc( input=add_sigmoid, name="weight_fc", size=1, act=None, num_flatten_dims=2, bias_attr=False, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, seq_max, 1] weight *= inputs[5] weight_mask = layers.elementwise_mul( seq, weight, axis=0) # [batch_size, seq_max, h] global_attention = layers.reduce_sum( weight_mask, dim=1) # [batch_size, h] final_attention = layers.concat( [global_attention, last], axis=1) # [batch_size, 2*h] final_attention_fc = layers.fc( input=final_attention,
name="final_attention_fc", size=self.hidden_size, bias_attr=False, act=None, param_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform( low=-stdv, high=stdv))) # [batch_size, h] # all_vocab = layers.create_global_var( # shape=[items_num - 1], # value=0, # dtype="int64", # persistable=True, # name="all_vocab") all_vocab = np.arange(1, self.dict_size).reshape((-1)).astype('int32') all_vocab = fluid.layers.cast( x=fluid.layers.assign(all_vocab), dtype='int64') all_emb = fluid.embedding( input=all_vocab, param_attr=fluid.ParamAttr( name="emb", initializer=fluid.initializer.Uniform( low=-stdv, high=stdv)), size=[self.dict_size, self.hidden_size]) # [all_vocab, h] logits = layers.matmul( x=final_attention_fc, y=all_emb, transpose_y=True) # [batch_size, all_vocab] softmax = layers.softmax_with_cross_entropy( logits=logits, label=inputs[6]) # [batch_size, 1] self.loss = layers.reduce_mean(softmax) # [1] self.acc = layers.accuracy(input=logits, label=inputs[6], k=20) self._cost = self.loss if is_infer: self._infer_results['acc'] = self.acc self._infer_results['loss'] = self.loss return self._metrics["LOSS"] = self.loss self._metrics["train_acc"] = self.acc
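# The inputs list consumed by net() above follows the SR-GNN session-graph
# layout. A summary of the expected tensors, inferred from how each index is
# used above (shapes are assumptions, not taken from the source):
# inputs[0]: item node ids,            [batch_size, uniq_max, 1], int64
# inputs[1]: sequence gather indices   (gather_nd over final_state -> seq)
# inputs[2]: last-click gather indices (gather_nd over final_state -> last)
# inputs[3]: incoming adjacency,       [batch_size, uniq_max, uniq_max], float32
# inputs[4]: outgoing adjacency,       [batch_size, uniq_max, uniq_max], float32
# inputs[5]: sequence mask,            [batch_size, seq_max, 1], float32
# inputs[6]: next-item label,          [batch_size, 1], int64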
def multi_head_attention(queries, keys, values, attn_bias, structure_mask, with_ent_structure, d_key, d_value, d_model, n_head=1, dropout_rate=0., cache=None, param_initializer=None, name='multi_head_att'): """ Multi-Head Attention. Note that attn_bias is added to the logits before computing the softmax activation, to mask certain selected positions so that they will not be considered in the attention weights. """ keys = queries if keys is None else keys values = keys if values is None else values if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( "Inputs: queries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ Add linear projection to queries, keys, and values. """ q = layers.fc(input=queries, size=d_key * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_query_fc.w_0', initializer=param_initializer), bias_attr=name + '_query_fc.b_0') k = layers.fc(input=keys, size=d_key * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_key_fc.w_0', initializer=param_initializer), bias_attr=name + '_key_fc.b_0') v = layers.fc(input=values, size=d_value * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_value_fc.w_0', initializer=param_initializer), bias_attr=name + '_value_fc.b_0') return q, k, v def __split_heads(x, n_head): """ Reshape the last dimension of input tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. """ hidden_size = x.shape[-1] # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. reshaped = layers.reshape( x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) # permute the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ Transpose and then reshape the last two dimensions of input tensor x so that it becomes one dimension, which is the reverse of __split_heads. """ if len(x.shape) == 3: return x if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size.
return layers.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) def scaled_dot_product_attention(q, k, v, attn_bias, biaffine_transformation, biaffine_transformation_bias, structure_mask, with_ent_structure, d_key, dropout_rate): """ Scaled Dot-Product Attention """ scaled_q = layers.scale(x=q, scale=d_key**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) if with_ent_structure: # TRANSFORMATION # 1.reshape input # q: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden] # transformation: [dependencies(5), n_head, hidden, hidden] -> [1, dependencies(5), n_head, hidden, hidden] # k: [bs, n_head, seq, hidden] -> [bs, 1, n_head, seq, hidden] q_ = layers.unsqueeze(scaled_q, [1]) q_ = layers.expand(q_, [1, biaffine_transformation.shape[0], 1, 1, 1]) biaffine_transformation_ = layers.unsqueeze(biaffine_transformation, [0]) biaffine_transformation_ = layers.expand(biaffine_transformation_, [q_.shape[0], 1, 1, 1, 1]) k_ = layers.unsqueeze(k, [1]) k_ = layers.expand(k_, [1, biaffine_transformation.shape[0], 1, 1, 1]) # 2.implement matmul # q * transformation: [bs, dependencies(5), n_head, seq, hidden] # q * transformation * k: [bs, dependencies(5), n_head, seq, seq] structured_bias = layers.matmul(x=q_, y=biaffine_transformation_) structured_bias = layers.matmul(x=structured_bias, y=k_, transpose_y=True) structured_bias = layers.elementwise_add(structured_bias, biaffine_transformation_bias, axis=1) # mask & apply structured_bias = structured_bias * structure_mask structured_bias = layers.reduce_sum(structured_bias, dim=1) product += structured_bias if attn_bias: product += attn_bias weights = layers.softmax(product) if dropout_rate: weights = layers.dropout( weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) out = layers.matmul(weights, v) return out q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) if cache is not None: # use cache and concat time steps # Since the inplace reshape in __split_heads changes the shape of k and # v, which is the cache input for next time step, reshape the cache # input from the previous time step first. k = cache["k"] = layers.concat( [layers.reshape( cache["k"], shape=[0, 0, d_model]), k], axis=1) v = cache["v"] = layers.concat( [layers.reshape( cache["v"], shape=[0, 0, d_model]), v], axis=1) q = __split_heads(q, n_head) k = __split_heads(k, n_head) v = __split_heads(v, n_head) biaffine_transformation = layers.create_parameter([5, n_head, d_key, d_key], core.VarDesc.VarType.FP32, name=name + '_biaffine_transformation', attr=None, is_bias=False, default_initializer=param_initializer) biaffine_transformation_bias = layers.create_parameter([5, n_head], core.VarDesc.VarType.FP32, name=name + '_biaffine_transformation_bias', attr=None, is_bias=False, default_initializer=param_initializer) ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, biaffine_transformation, biaffine_transformation_bias, structure_mask, with_ent_structure, d_key, dropout_rate) out = __combine_heads(ctx_multiheads) # Project back to the model size. proj_out = layers.fc(input=out, size=d_model, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_output_fc.w_0', initializer=param_initializer), bias_attr=name + '_output_fc.b_0') return proj_out
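# A minimal sketch of calling multi_head_attention in self-attention mode; the
# tensor names and sizes are assumptions for illustration. structure_mask is
# only consumed when with_ent_structure is True, matching the branch above.
q_in = layers.data(name='mha_input', shape=[16, 256], dtype='float32')   # [batch, seq, d_model]
bias = layers.data(name='mha_bias', shape=[8, 16, 16], dtype='float32')  # [batch, n_head, seq, seq]
ctx = multi_head_attention(
    queries=q_in, keys=None, values=None, attn_bias=bias,
    structure_mask=None, with_ent_structure=False,
    d_key=32, d_value=32, d_model=256, n_head=8)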