from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr

__all__ = ["ResNet", "ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"]

Trainable = True
w_nolr = fluid.ParamAttr(trainable=Trainable)
train_parameters = {
    "input_size": [3, 224, 224],
    "input_mean": [0.485, 0.456, 0.406],
    "input_std": [0.229, 0.224, 0.225],
    "learning_strategy": {
        "name": "piecewise_decay",
        "batch_size": 256,
        "epochs": [30, 60, 90],
        "steps": [0.1, 0.01, 0.001, 0.0001]
    }
}


class ResNet():
    def __init__(self, params):
        self.layers = params['layers']
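# The "learning_strategy" entry above describes a piecewise decay: training starts at
# steps[0] and drops to the next value each time the epoch count crosses one of the
# boundaries in "epochs". A minimal sketch of that mapping; `piecewise_lr` is a
# hypothetical helper, not part of the original module.
def piecewise_lr(epoch, strategy=train_parameters["learning_strategy"]):
    boundaries = strategy["epochs"]   # e.g. [30, 60, 90]
    values = strategy["steps"]        # e.g. [0.1, 0.01, 0.001, 0.0001]
    for i, boundary in enumerate(boundaries):
        if epoch < boundary:
            return values[i]
    return values[-1]

# e.g. piecewise_lr(0) -> 0.1, piecewise_lr(45) -> 0.01, piecewise_lr(95) -> 0.0001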
def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, **ignored): # 8 features predicate_embedding = fluid.layers.embedding( input=predicate, size=[pred_dict_len, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr='vemb') mark_embedding = fluid.layers.embedding( input=mark, size=[mark_dict_len, mark_dim], dtype='float32', is_sparse=IS_SPARSE) word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, param_attr=fluid.ParamAttr( name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers ] hidden_0 = fluid.layers.sums(input=hidden_0_layers) lstm_0 = fluid.layers.dynamic_lstm( input=hidden_0, size=hidden_dim, candidate_activation='relu', gate_activation='sigmoid', cell_activation='sigmoid') # stack L-LSTM and R-LSTM with direct edges input_tmp = [hidden_0, lstm_0] for i in range(1, depth): mix_hidden = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=hidden_dim), fluid.layers.fc(input=input_tmp[1], size=hidden_dim) ]) lstm = fluid.layers.dynamic_lstm( input=mix_hidden, size=hidden_dim, candidate_activation='relu', gate_activation='sigmoid', cell_activation='sigmoid', is_reverse=((i % 2) == 1)) input_tmp = [mix_hidden, lstm] feature_out = fluid.layers.sums(input=[ fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'), fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh') ]) return feature_out
def forward(self, graph_wrapper, is_test=False): """ Build the network. """ node_features = self._mol_encoder(graph_wrapper, name=self.name) features_list = [node_features] for layer in range(self.layer_num): edge_features = self._bond_encoder(graph_wrapper, name='%s_layer%s' % (self.name, layer)) if self.gnn_type == "gcn": feat = gcn_layer(graph_wrapper, features_list[layer], edge_features, act="relu", name="%s_layer%s_gcn" % (self.name, layer)) elif self.gnn_type == "gat": feat = gat_layer(graph_wrapper, features_list[layer], edge_features, self.embed_dim, act="relu", name="%s_layer%s_gat" % (self.name, layer)) else: feat = gin_layer(graph_wrapper, features_list[layer], edge_features, name="%s_layer%s_gin" % (self.name, layer)) if self.norm_type == 'batch_norm': feat = layers.batch_norm( feat, param_attr=fluid.ParamAttr( name="%s_layer%s_batch_norm_scale" % (self.name, layer), initializer=fluid.initializer.Constant(1.0)), bias_attr=fluid.ParamAttr( name="%s_layer%s_batch_norm_bias" % (self.name, layer), initializer=fluid.initializer.Constant(0.0)), moving_mean_name="%s_layer%s_batch_norm_moving_avearage" % (self.name, layer), moving_variance_name="%s_layer%s_batch_norm_moving_variance" % (self.name, layer), is_test=is_test) elif self.norm_type == 'layer_norm': feat = layers.layer_norm( feat, param_attr=fluid.ParamAttr( name="%s_layer%s_layer_norm_scale" % (self.name, layer), initializer=fluid.initializer.Constant(1.0)), bias_attr=fluid.ParamAttr( name="%s_layer%s_layer_norm_bias" % (self.name, layer), initializer=fluid.initializer.Constant(0.0))) else: raise ValueError('%s not supported.' % self.norm_type) if self.graph_norm: feat = pgl.layers.graph_norm(graph_wrapper, feat) if layer < self.layer_num - 1: feat = layers.relu(feat) feat = layers.dropout(feat, self.dropout_rate, dropout_implementation="upscale_in_train", is_test=is_test) # residual if self.residual: feat = feat + features_list[layer] features_list.append(feat) if self.JK == "sum": node_repr = layers.reduce_sum(features_list, axis=0) elif self.JK == "mean": node_repr = layers.reduce_mean(features_list, axis=0) elif self.JK == "last": node_repr = features_list[-1] else: node_repr = features_list[-1] return node_repr
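# The JK branch at the end of forward() selects how the per-layer node features are
# combined into the final node representation. A minimal NumPy sketch of the three
# modes; the layer count and feature shapes are illustrative assumptions.
import numpy as np

num_nodes, feat_dim, num_layers = 5, 8, 3
# one (num_nodes, feat_dim) array per GNN layer, plus the input features
features_list = [np.random.rand(num_nodes, feat_dim) for _ in range(num_layers + 1)]

stacked = np.stack(features_list, axis=0)   # (num_layers + 1, num_nodes, feat_dim)
jk_sum = stacked.sum(axis=0)                # "sum": element-wise sum over layers
jk_mean = stacked.mean(axis=0)              # "mean": element-wise mean over layers
jk_last = features_list[-1]                 # "last" (and the default): deepest layer only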
def __init__(self, num_labels, n_layer, hidden_size=768, name="encoder", search_layer=True, use_fixed_gumbel=False, gumbel_alphas=None): super(EncoderLayer, self).__init__() self._n_layer = n_layer self._hidden_size = hidden_size self._n_channel = 128 self._steps = 3 self._n_ops = len(ConvBN_PRIMITIVES) self.use_fixed_gumbel = use_fixed_gumbel self.stem0 = fluid.dygraph.Sequential( Conv2D(num_channels=1, num_filters=self._n_channel, filter_size=[3, self._hidden_size], padding=[1, 0], param_attr=fluid.ParamAttr(initializer=MSRA()), bias_attr=False), BatchNorm(num_channels=self._n_channel, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=1)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0)))) self.stem1 = fluid.dygraph.Sequential( Conv2D(num_channels=1, num_filters=self._n_channel, filter_size=[3, self._hidden_size], padding=[1, 0], param_attr=fluid.ParamAttr(initializer=MSRA()), bias_attr=False), BatchNorm(num_channels=self._n_channel, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=1)), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0)))) cells = [] for i in range(n_layer): cell = Cell(steps=self._steps, n_channel=self._n_channel, name="%s/layer_%d" % (name, i)) cells.append(cell) self._cells = fluid.dygraph.LayerList(cells) k = sum(1 for i in range(self._steps) for n in range(2 + i)) num_ops = self._n_ops self.alphas = fluid.layers.create_parameter( shape=[k, num_ops], dtype="float32", default_initializer=NormalInitializer(loc=0.0, scale=1e-3)) self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) self.bns = [] self.outs = [] for i in range(self._n_layer): bn = BatchNorm(num_channels=self._n_channel, param_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=1), trainable=False), bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0), trainable=False)) out = Linear(self._n_channel, num_labels, param_attr=ParamAttr(initializer=MSRA()), bias_attr=ParamAttr(initializer=MSRA())) self.bns.append(bn) self.outs.append(out) self._bns = fluid.dygraph.LayerList(self.bns) self._outs = fluid.dygraph.LayerList(self.outs) self.use_fixed_gumbel = use_fixed_gumbel #self.gumbel_alphas = gumbel_softmax(self.alphas, 0).detach() mrpc_arch = [ [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 0 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 1 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 0 # node2 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # std_conv3 1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2 [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] # dil_conv3 3 ] self.gumbel_alphas = to_variable( np.array(mrpc_arch).astype(np.float32)) self.gumbel_alphas.stop_gradient = True print("gumbel_alphas: \n", self.gumbel_alphas.numpy())
def context(self, trainable=False, max_seq_len=128, num_slots=1): """ Get the input ,output and program of the pretrained emotion_detection_textcnn Args: trainable(bool): Whether fine-tune the pretrained parameters of emotion_detection_textcnn or not. max_seq_len (int): It will limit the total sequence returned so that it has a maximum length. num_slots(int): It's number of data inputted to the model, selectted as following options: - 1(default): There's only one data to be feeded in the model, e.g. the module is used for text classification task. - 2: There are two data to be feeded in the model, e.g. the module is used for text matching task (point-wise). - 3: There are three data to be feeded in the model, e.g. the module is used for text matching task (pair-wise). Returns: inputs(dict): the input variables of emotion_detection_textcnn (words) outputs(dict): the output variables of input words (word embeddings and label probilities); the sentence embedding and sequence length of the first input text. main_program(Program): the main_program of emotion_detection_textcnn with pretrained prameters """ assert num_slots >= 1 and num_slots <= 3, "num_slots must be 1, 2, or 3, but the input is %d" % num_slots main_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(main_program, startup_program): text_1 = fluid.layers.data(name="text", shape=[-1, max_seq_len, 1], dtype="int64", lod_level=0) seq_len = fluid.layers.data(name="seq_len", shape=[1], dtype='int64', lod_level=0) seq_len_used = fluid.layers.squeeze(seq_len, axes=[1]) # Add embedding layer. w_param_attrs = fluid.ParamAttr( name="embedding_0.w_0", initializer=fluid.initializer.TruncatedNormal(scale=0.02), trainable=trainable) dict_dim = 240466 emb_1 = fluid.layers.embedding( input=text_1, size=[dict_dim, 128], is_sparse=True, padding_idx=dict_dim - 1, dtype='float32', param_attr=w_param_attrs) emb_1_name = emb_1.name data_list = [text_1] emb_name_list = [emb_1_name] # Add lstm layer. pred, fc = textcnn_net(emb_1, seq_len_used) pred_name = pred.name fc_name = fc.name if num_slots > 1: text_2 = fluid.data(name='text_2', shape=[-1, max_seq_len], dtype='int64', lod_level=0) emb_2 = fluid.embedding( input=text_2, size=[dict_dim, 128], is_sparse=True, padding_idx=dict_dim - 1, dtype='float32', param_attr=w_param_attrs) emb_2_name = emb_2.name data_list.append(text_2) emb_name_list.append(emb_2_name) if num_slots > 2: text_3 = fluid.data(name='text_3', shape=[-1, max_seq_len], dtype='int64', lod_level=0) emb_3 = fluid.embedding( input=text_3, size=[dict_dim, 128], is_sparse=True, padding_idx=dict_dim - 1, dtype='float32', param_attr=w_param_attrs) emb_3_name = emb_3.name data_list.append(text_3) emb_name_list.append(emb_3_name) variable_names = filter(lambda v: v not in ['text', 'text_2', 'text_3', "seq_len"], list(main_program.global_block().vars.keys())) prefix_name = "@HUB_{}@".format(self.name) add_vars_prefix(program=main_program, prefix=prefix_name, vars=variable_names) for param in main_program.global_block().iter_parameters(): param.trainable = trainable place = fluid.CPUPlace() exe = fluid.Executor(place) # Load the emotion_detection_textcnn pretrained model. 
def if_exist(var): return os.path.exists(os.path.join(self.pretrained_model_path, var.name)) fluid.io.load_vars(exe, self.pretrained_model_path, predicate=if_exist) inputs = {'seq_len': seq_len} outputs = { "class_probs": main_program.global_block().vars[prefix_name + pred_name], "sentence_feature": main_program.global_block().vars[prefix_name + fc_name] } for index, data in enumerate(data_list): if index == 0: inputs['text'] = data outputs['emb'] = main_program.global_block().vars[prefix_name + emb_name_list[0]] else: inputs['text_%s' % (index + 1)] = data outputs['emb_%s' % (index + 1)] = main_program.global_block().vars[prefix_name + emb_name_list[index]] return inputs, outputs, main_program
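# context() returns the input variables, output variables, and the program that holds
# the pretrained parameters. A hedged sketch of how a caller might run it; `module` is
# an assumed instance of the class above, the feed data is dummy, and the pretrained
# parameters are assumed to have been loaded into the global scope by the module itself.
import numpy as np
import paddle.fluid as fluid

inputs, outputs, program = module.context(trainable=False, max_seq_len=128, num_slots=1)

place = fluid.CPUPlace()
exe = fluid.Executor(place)

text = np.zeros((1, 128, 1), dtype="int64")   # one padded sequence of token ids
seq_len = np.array([[5]], dtype="int64")      # its real length

probs = exe.run(program,
                feed={inputs["text"].name: text,
                      inputs["seq_len"].name: seq_len},
                fetch_list=[outputs["class_probs"]])[0]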
def point_network_decoder(p_vec, q_vec, hidden_size, args): tag = 'pn_decoder:' init_random = fluid.initializer.Normal(loc=0.0, scale=1.0) random_attn = layers.create_parameter( shape=[1, hidden_size], dtype='float32', default_initializer=init_random) random_attn = layers.fc( input=random_attn, size=hidden_size, act=None, param_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_w'), bias_attr=fluid.ParamAttr(name=tag + 'random_attn_fc_b')) random_attn = layers.reshape(random_attn, shape=[-1]) U = layers.fc(input=q_vec, param_attr=fluid.ParamAttr(name=tag + 'q_vec_fc_w'), bias_attr=False, size=hidden_size, act=None) + random_attn U = layers.tanh(U) logits = layers.fc(input=U, param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'), bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b'), size=1, act=None) scores = layers.sequence_softmax(input=logits) pooled_vec = layers.elementwise_mul(x=q_vec, y=scores, axis=0) pooled_vec = layers.sequence_pool(input=pooled_vec, pool_type='sum') init_state = layers.fc( input=pooled_vec, param_attr=fluid.ParamAttr(name=tag + 'init_state_fc_w'), bias_attr=fluid.ParamAttr(name=tag + 'init_state_fc_b'), size=hidden_size, act=None) def custom_dynamic_rnn(p_vec, init_state, hidden_size, para_name, args): tag = para_name + "custom_dynamic_rnn:" def static_rnn(step, p_vec=p_vec, init_state=None, para_name='', args=args): tag = para_name + "static_rnn:" ctx = layers.fc( input=p_vec, param_attr=fluid.ParamAttr(name=tag + 'context_fc_w'), bias_attr=fluid.ParamAttr(name=tag + 'context_fc_b'), size=hidden_size, act=None) beta = [] c_prev = init_state m_prev = init_state for i in range(step): m_prev0 = layers.fc( input=m_prev, size=hidden_size, act=None, param_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_w'), bias_attr=fluid.ParamAttr(name=tag + 'm_prev0_fc_b')) m_prev1 = layers.sequence_expand(x=m_prev0, y=ctx) Fk = ctx + m_prev1 Fk = layers.tanh(Fk) logits = layers.fc( input=Fk, size=1, act=None, param_attr=fluid.ParamAttr(name=tag + 'logits_fc_w'), bias_attr=fluid.ParamAttr(name=tag + 'logits_fc_b')) scores = layers.sequence_softmax(input=logits) attn_ctx = layers.elementwise_mul(x=p_vec, y=scores, axis=0) attn_ctx = layers.sequence_pool(input=attn_ctx, pool_type='sum') hidden_t, cell_t = lstm_step( attn_ctx, hidden_t_prev=m_prev, cell_t_prev=c_prev, size=hidden_size, para_name=tag, args=args) m_prev = hidden_t c_prev = cell_t beta.append(scores) return beta return static_rnn( 2, p_vec=p_vec, init_state=init_state, para_name=para_name) fw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "fw:", args) bw_outputs = custom_dynamic_rnn(p_vec, init_state, hidden_size, tag + "bw:", args) start_prob = layers.elementwise_add( x=fw_outputs[0], y=bw_outputs[1], axis=0) / 2 end_prob = layers.elementwise_add( x=fw_outputs[1], y=bw_outputs[0], axis=0) / 2 return start_prob, end_prob
def gaan(gw, feature, hidden_size_a, hidden_size_v, hidden_size_m, hidden_size_o, heads, name): """Implementation of GaAN""" def send_func(src_feat, dst_feat, edge_feat): # compute attention # E * (M * D1) feat_query, feat_key = dst_feat['feat_query'], src_feat['feat_key'] # E * M * D1 old = feat_query feat_query = fluid.layers.reshape(feat_query, [-1, heads, hidden_size_a]) feat_key = fluid.layers.reshape(feat_key, [-1, heads, hidden_size_a]) # E * M alpha = fluid.layers.reduce_sum(feat_key * feat_query, dim=-1) return { 'dst_node_feat': dst_feat['node_feat'], 'src_node_feat': src_feat['node_feat'], 'feat_value': src_feat['feat_value'], 'alpha': alpha, 'feat_gate': src_feat['feat_gate'] } def recv_func(message): # feature of src and dst node on each edge dst_feat = message['dst_node_feat'] src_feat = message['src_node_feat'] # feature of center node x = fluid.layers.sequence_pool(dst_feat, 'average') # feature of neighbors of center node z = fluid.layers.sequence_pool(src_feat, 'average') # compute gate feat_gate = message['feat_gate'] g_max = fluid.layers.sequence_pool(feat_gate, 'max') g = fluid.layers.concat([x, g_max, z], axis=1) g = fluid.layers.fc(g, heads, bias_attr=False, act="sigmoid") # softmax alpha = message['alpha'] alpha = paddle_helper.sequence_softmax(alpha) # E * M feat_value = message['feat_value'] # E * (M * D2) old = feat_value feat_value = fluid.layers.reshape( feat_value, [-1, heads, hidden_size_v]) # E * M * D2 feat_value = fluid.layers.elementwise_mul(feat_value, alpha, axis=0) feat_value = fluid.layers.reshape( feat_value, [-1, heads * hidden_size_v]) # E * (M * D2) feat_value = fluid.layers.lod_reset(feat_value, old) feat_value = fluid.layers.sequence_pool(feat_value, 'sum') # N * (M * D2) feat_value = fluid.layers.reshape( feat_value, [-1, heads, hidden_size_v]) # N * M * D2 output = fluid.layers.elementwise_mul(feat_value, g, axis=0) output = fluid.layers.reshape( output, [-1, heads * hidden_size_v]) # N * (M * D2) output = fluid.layers.concat([x, output], axis=1) return output # N * (D1 * M) feat_key = fluid.layers.fc(feature, hidden_size_a * heads, bias_attr=False, param_attr=fluid.ParamAttr(name=name + '_project_key')) # N * (D2 * M) feat_value = fluid.layers.fc(feature, hidden_size_v * heads, bias_attr=False, param_attr=fluid.ParamAttr(name=name + '_project_value')) # N * (D1 * M) feat_query = fluid.layers.fc(feature, hidden_size_a * heads, bias_attr=False, param_attr=fluid.ParamAttr(name=name + '_project_query')) # N * Dm feat_gate = fluid.layers.fc(feature, hidden_size_m, bias_attr=False, param_attr=fluid.ParamAttr(name=name + '_project_gate')) # send message = gw.send( send_func, nfeat_list=[('node_feat', feature), ('feat_key', feat_key), ('feat_value', feat_value), ('feat_query', feat_query), ('feat_gate', feat_gate)], efeat_list=None, ) # recv output = gw.recv(message, recv_func) output = fluid.layers.fc(output, hidden_size_o, bias_attr=False, param_attr=fluid.ParamAttr(name=name + '_project_output')) output = fluid.layers.leaky_relu(output, alpha=0.1) output = fluid.layers.dropout(output, dropout_prob=0.1) return output
def __init__(self, hidden_size, num_steps, num_layers=1, init_scale=0.1, dropout=None):
    # Parameters of this model:
    # 1. hidden_size: the embedding size, i.e. the dimension of the memory vectors.
    # 2. num_steps: the longest time sequence the LSTM needs to consider.
    # 3. num_layers: the number of stacked LSTM layers. Given an input of shape
    #    [batch_size, seq_len, embedding_size], an LSTM layer produces an output of the
    #    same shape, which can be chained into another LSTM layer; stacking several
    #    layers helps the network model more complex sentences or even documents.
    # 4. init_scale: the initialization range of the parameters. LSTMs use many tanh
    #    and sigmoid activations that are sensitive to numerical precision, so a small
    #    initialization range is usually used to keep the results stable.
    super(SimpleLSTMRNN, self).__init__()
    self._hidden_size = hidden_size
    self._num_layers = num_layers
    self._init_scale = init_scale
    self._dropout = dropout
    self._input = None
    self._num_steps = num_steps
    self.cell_array = []
    self.hidden_array = []

    # weight_1_arr stores the W parameters of the different gates for each LSTM layer
    self.weight_1_arr = []
    self.weight_2_arr = []
    # bias_arr stores the b parameters of the different gates for each LSTM layer
    self.bias_arr = []
    self.mask_array = []

    # Create the parameters of each LSTM layer with create_parameter. From the LSTM
    # formulas we need 8 W matrices of shape [_hidden_size, _hidden_size] and 4 b
    # vectors of shape [_hidden_size]; instead of declaring them separately, we declare
    # one [self._hidden_size * 2, self._hidden_size * 4] weight and one
    # [self._hidden_size * 4] bias per layer, so that the 8 matrix multiplications can
    # be computed with a single matmul, which speeds up computation.
    for i in range(self._num_layers):
        weight_1 = self.create_parameter(
            attr=fluid.ParamAttr(
                initializer=fluid.initializer.UniformInitializer(
                    low=-self._init_scale, high=self._init_scale)),
            shape=[self._hidden_size * 2, self._hidden_size * 4],
            dtype="float32",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self._init_scale, high=self._init_scale))
        self.weight_1_arr.append(self.add_parameter('w_%d' % i, weight_1))
        bias_1 = self.create_parameter(
            attr=fluid.ParamAttr(
                initializer=fluid.initializer.UniformInitializer(
                    low=-self._init_scale, high=self._init_scale)),
            shape=[self._hidden_size * 4],
            dtype="float32",
            default_initializer=fluid.initializer.Constant(0.0))
        self.bias_arr.append(self.add_parameter('b_%d' % i, bias_1))
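# The fused weight layout above (one [2 * hidden, 4 * hidden] matrix and one
# [4 * hidden] bias per layer) lets a single matmul produce all four LSTM gates at
# once. A NumPy sketch of that fused step; the gate order i, j, f, o follows the
# common Paddle PTB example and should be treated as an assumption.
import numpy as np

hidden = 4
x = np.random.rand(2, hidden)             # input at the current step, (batch, hidden)
h_prev = np.random.rand(2, hidden)        # previous hidden state
c_prev = np.random.rand(2, hidden)        # previous cell state
W = np.random.rand(2 * hidden, 4 * hidden)
b = np.zeros(4 * hidden)

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

gate_input = np.concatenate([x, h_prev], axis=1) @ W + b   # one matmul for all gates
i, j, f, o = np.split(gate_input, 4, axis=1)               # input, candidate, forget, output
c = c_prev * sigmoid(f) + sigmoid(i) * np.tanh(j)          # new cell state
h = np.tanh(c) * sigmoid(o)                                # new hidden state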
def net(self, inputs, is_infer=False): # ------------------------- network input -------------------------- hist_item_seq = inputs[0] # history item sequence hist_cat_seq = inputs[1] # history category sequence target_item = inputs[2] # one dim target item target_cat = inputs[3] # one dim target category label = inputs[4] # label mask = inputs[5] # mask target_item_seq = inputs[6] # target item expand to sequence target_cat_seq = inputs[7] # traget category expand to sequence neg_hist_item_seq = inputs[8] # neg item sampling for aux loss neg_hist_cat_seq = inputs[9] # neg cat sampling for aux loss item_emb_attr = fluid.ParamAttr(name="item_emb") cur_program = fluid.Program() cur_block = cur_program.current_block() item_emb_copy = cur_block.create_var( name="item_emb", shape=[self.item_count, self.item_emb_size], dtype='float32') #item_emb_copy = fluid.layers.Print(item_emb_copy, message="Testing:") ##item_emb_attr = fluid.layers.Print(item_emb_attr, summarize=2) cat_emb_attr = fluid.ParamAttr(name="cat_emb") # ------------------------- Embedding Layer -------------------------- hist_item_emb = fluid.embedding( input=hist_item_seq, size=[self.item_count, self.item_emb_size], param_attr=item_emb_attr, is_sparse=self.is_sparse) item_emb_copy = fluid.layers.Print(item_emb_copy, message="Testing:", summarize=20, print_phase='backward') neg_hist_cat_emb = fluid.embedding( input=neg_hist_cat_seq, size=[self.cat_count, self.cat_emb_size], param_attr=cat_emb_attr, is_sparse=self.is_sparse) neg_hist_item_emb = fluid.embedding( input=neg_hist_item_seq, size=[self.item_count, self.item_emb_size], param_attr=item_emb_attr, is_sparse=self.is_sparse) hist_cat_emb = fluid.embedding( input=hist_cat_seq, size=[self.cat_count, self.cat_emb_size], param_attr=cat_emb_attr, is_sparse=self.is_sparse) target_item_emb = fluid.embedding( input=target_item, size=[self.item_count, self.item_emb_size], param_attr=item_emb_attr, is_sparse=self.is_sparse) target_cat_emb = fluid.embedding( input=target_cat, size=[self.cat_count, self.cat_emb_size], param_attr=cat_emb_attr, is_sparse=self.is_sparse) target_item_seq_emb = fluid.embedding( input=target_item_seq, size=[self.item_count, self.item_emb_size], param_attr=item_emb_attr, is_sparse=self.is_sparse) target_cat_seq_emb = fluid.embedding( input=target_cat_seq, size=[self.cat_count, self.cat_emb_size], param_attr=cat_emb_attr, is_sparse=self.is_sparse) item_b = fluid.embedding( input=target_item, size=[self.item_count, 1], param_attr=fluid.initializer.Constant(value=0.0)) # ------------------------- Interest Extractor Layer -------------------------- hist_seq_concat = fluid.layers.concat([hist_item_emb, hist_cat_emb], axis=2) neg_hist_seq_concat = fluid.layers.concat( [neg_hist_item_emb, neg_hist_cat_emb], axis=2) target_seq_concat = fluid.layers.concat( [target_item_seq_emb, target_cat_seq_emb], axis=2) target_concat = fluid.layers.concat([target_item_emb, target_cat_emb], axis=1) reshape_hist_item_emb = fluid.layers.reduce_sum(hist_seq_concat, dim=1) neg_reshape_hist_item_emb = fluid.layers.reduce_sum( neg_hist_seq_concat, dim=1) gru_input_hist_item_emb = fluid.layers.concat([reshape_hist_item_emb] * 3, axis=1) gru_h1 = fluid.layers.dynamic_gru(gru_input_hist_item_emb, size=self.item_emb_size * 2) gru_h1_input = fluid.layers.concat([gru_h1] * 3, axis=1) gru_h2 = fluid.layers.dynamic_gru(gru_h1_input, size=self.item_emb_size * 2) # ------------------------- Auxiliary loss -------------------------- pad_value = fluid.layers.zeros(shape=[1], dtype='float32') start_value 
= fluid.layers.zeros(shape=[1], dtype='int32') gru_out_pad, lengths = fluid.layers.sequence_pad(gru_h2, pad_value) pos_seq_pad, _ = fluid.layers.sequence_pad(reshape_hist_item_emb, pad_value) neg_seq_pad, _ = fluid.layers.sequence_pad(neg_reshape_hist_item_emb, pad_value) seq_shape = fluid.layers.shape(pos_seq_pad) if (seq_shape[1] == 1): aux_loss = 0 else: test_pos = fluid.layers.reduce_sum(fluid.layers.reduce_sum( fluid.layers.log( fluid.layers.sigmoid( fluid.layers.reduce_sum( gru_out_pad[:, start_value:seq_shape[1] - 1, :] * pos_seq_pad[:, start_value + 1:seq_shape[1], :], dim=2, keep_dim=True))), dim=2), dim=1, keep_dim=True) test_neg = fluid.layers.reduce_sum(fluid.layers.reduce_sum( fluid.layers.log( fluid.layers.sigmoid( fluid.layers.reduce_sum( gru_out_pad[:, start_value:seq_shape[1] - 1, :] * neg_seq_pad[:, start_value + 1:seq_shape[1], :], dim=2, keep_dim=True))), dim=2), dim=1, keep_dim=True) aux_loss = fluid.layers.mean(test_neg + test_pos) # ------------------------- Interest Evolving Layer (GRU with attentional input (AIGRU)) -------------------------- weighted_vector = self.din_attention(gru_out_pad, target_seq_concat, mask) weighted_vector = fluid.layers.transpose(weighted_vector, [1, 0, 2]) concat_weighted_vector = fluid.layers.concat([weighted_vector] * 3, axis=2) attention_rnn = fluid.layers.StaticRNN(name="attention_evolution") with attention_rnn.step(): word = attention_rnn.step_input(concat_weighted_vector) prev = attention_rnn.memory(shape=[-1, self.item_emb_size * 2], batch_ref=word) hidden, _, _ = fluid.layers.gru_unit(input=word, hidden=prev, size=self.item_emb_size * 6) attention_rnn.update_memory(prev, hidden) attention_rnn.output(hidden) attention_rnn_res = attention_rnn() attention_rnn_res_T = fluid.layers.transpose(attention_rnn_res, [1, 0, 2])[:, -1, :] out = fluid.layers.sequence_pool(input=hist_item_emb, pool_type='sum') out_fc = fluid.layers.fc(name="out_fc", input=out, size=self.item_emb_size + self.cat_emb_size, num_flatten_dims=1) embedding_concat = fluid.layers.concat( [attention_rnn_res_T, target_concat], axis=1) fc1 = fluid.layers.fc(name="fc1", input=embedding_concat, size=80, act=self.act) fc2 = fluid.layers.fc(name="fc2", input=fc1, size=40, act=self.act) fc3 = fluid.layers.fc(name="fc3", input=fc2, size=1) logit = fc3 + item_b loss = fluid.layers.sigmoid_cross_entropy_with_logits(x=logit, label=label) avg_loss = fluid.layers.mean(loss) + aux_loss self._cost = avg_loss self.predict = fluid.layers.sigmoid(logit) predict_2d = fluid.layers.concat([1 - self.predict, self.predict], 1) label_int = fluid.layers.cast(label, 'int64') auc_var, batch_auc_var, _ = fluid.layers.auc(input=predict_2d, label=label_int, slide_steps=0) self._metrics["AUC"] = auc_var self._metrics["BATCH_AUC"] = batch_auc_var if is_infer: self._infer_results["AUC"] = auc_var
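# The auxiliary loss above scores each GRU interest state against the next clicked
# item embedding and a sampled negative embedding via a dot product passed through a
# sigmoid. A NumPy sketch of that score for a single padded sequence; the shapes are
# illustrative assumptions.
import numpy as np

T, D = 6, 8                               # padded length, concatenated embedding size
gru_out = np.random.rand(1, T, D)         # interest states from the extractor GRU
pos_seq = np.random.rand(1, T, D)         # embeddings of the real next items
neg_seq = np.random.rand(1, T, D)         # embeddings of sampled negative items

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

# the state at step t is scored against the item at step t + 1
pos_score = sigmoid((gru_out[:, :-1, :] * pos_seq[:, 1:, :]).sum(axis=2))
neg_score = sigmoid((gru_out[:, :-1, :] * neg_seq[:, 1:, :]).sum(axis=2))

# the snippet sums the log-sigmoid scores of both pair types and averages them as
# aux_loss (note the DIEN paper's auxiliary loss uses log(1 - sigmoid) for negatives)
aux = np.log(pos_score).sum() + np.log(neg_score).sum()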
def test_pipeline(self): x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0) y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0) emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10, 2], is_sparse=False) emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby", learning_rate=0.9), size=[10, 2], is_sparse=False) concat = layers.concat([emb_x, emb_y], axis=1) fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) loss = layers.reduce_mean(fc) optimizer = fluid.optimizer.SGD(learning_rate=0.5) optimizer = fluid.optimizer.PipelineOptimizer( optimizer, cut_list=[[emb_x, emb_y], [loss]], place_list=[ fluid.CPUPlace(), fluid.CUDAPlace(0), fluid.CPUPlace() ], concurrency_list=[1, 1, 1], queue_size=1, sync_steps=10000000, ) optimizer.minimize(loss) place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) #prepare data batch_size = 100 def binary_print(slot, fout): num = np.int16(len(slot) + 1) num.tofile(fout) a = np.int64(batch_size) a.tofile(fout) slot.tofile(fout) #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1) #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1) batch1 = np.ones( (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1) batch2 = np.ones( (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1) data = [batch1, batch2] filelist = [] for i in range(2): filelist.append("test_pipeline_input_" + str(i)) for f in filelist: with open(f, "wb") as fout: for batch_data in data: for ins in batch_data: for slot in ins: binary_print(slot, fout) dataset = fluid.DatasetFactory().create_dataset("FileInstantDataset") dataset.set_use_var([x, y]) dataset.set_batch_size(batch_size) dataset.set_filelist(filelist) for epoch in range(1): exe.train_from_dataset(fluid.default_main_program(), dataset, thread=1, debug=False, fetch_list=[], fetch_info=[], print_period=1) for f in filelist: os.remove(f)
def __init__(self, hidden_size, vocab_size, class_num=2, num_layers=1, num_steps=128, init_scale=0.1, dropout=None):
    # Parameters of this model:
    # 1. hidden_size: the embedding size and the dimension of the hidden/cell vectors.
    # 2. vocab_size: the size of the vocabulary the model can handle.
    # 3. class_num: the number of sentiment classes; binary or multi-class.
    # 4. num_steps: the maximum sentence length this sentiment model can consider.
    # 5. init_scale: the initialization range of the parameters. LSTMs use many tanh
    #    and sigmoid activations that are sensitive to numerical precision, so a small
    #    initialization range is usually used to keep the results stable.
    super(SentimentClassifier, self).__init__()
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.class_num = class_num
    self.init_scale = init_scale
    self.num_layers = num_layers
    self.num_steps = num_steps
    self.dropout = dropout
    # An LSTM model used to abstract a sentence into a single vector
    self.simple_lstm_rnn = SimpleLSTMRNN(hidden_size,
                                         num_steps,
                                         num_layers=num_layers,
                                         init_scale=init_scale,
                                         dropout=dropout)
    # An embedding layer used to convert each word of the sentence into a vector
    self.embedding = Embedding(
        size=[vocab_size, hidden_size],
        dtype='float32',
        is_sparse=False,
        param_attr=fluid.ParamAttr(
            name='embedding_para',
            initializer=fluid.initializer.UniformInitializer(
                low=-init_scale, high=init_scale)))
    # Once the sentence vector is obtained, the sentence is classified by multiplying
    # the vector with a [self.hidden_size, self.class_num] weight W and adding a
    # [self.class_num] bias b, which maps the sentence vector to the class scores.
    # Declare the weight used to map the sentence vector to a sentiment class;
    # its shape is [self.hidden_size, self.class_num].
    self.softmax_weight = self.create_parameter(
        attr=fluid.ParamAttr(),
        shape=[self.hidden_size, self.class_num],
        dtype="float32",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))
    # Similarly, declare the bias used in the final classification;
    # its shape is [self.class_num].
    self.softmax_bias = self.create_parameter(
        attr=fluid.ParamAttr(),
        shape=[self.class_num],
        dtype="float32",
        default_initializer=fluid.initializer.UniformInitializer(
            low=-self.init_scale, high=self.init_scale))
def single_section(self, random_dump): program = fluid.Program() with fluid.program_guard(program): x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0) y = fluid.layers.data(name='y', shape=[1], dtype='int64', lod_level=0) emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10, 2], is_sparse=False) emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr( name="emby", learning_rate=0.9), size=[10, 2], is_sparse=False) concat = layers.concat([emb_x, emb_y], axis=1) fc = layers.fc(input=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) loss = layers.reduce_mean(fc) optimizer = fluid.optimizer.SGD(learning_rate=0.5) optimizer = fluid.optimizer.PipelineOptimizer( optimizer, cut_list=[], #place_list=[fluid.CPUPlace()], place_list=[fluid.CUDAPlace(0)], concurrency_list=[1], queue_size=1, sync_steps=-1) optimizer.minimize(loss) program._pipeline_opt["dump_fields"] = [ "fc.tmp_0", "fc.tmp_0@GRAD" ] program._pipeline_opt["dump_fields_path"] = "./dump_log/" program._pipeline_opt["dump_param"] = ["embx"] program._pipeline_opt["enable_random_dump"] = random_dump program._pipeline_opt["dump_interval"] = 10 program._pipeline_opt["random_with_lineid"] = False #print(program._pipeline_opt) place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) #prepare data batch_size = 100 def binary_print(slot, fout): num = np.int16(len(slot) + 1) num.tofile(fout) a = np.int64(batch_size) a.tofile(fout) slot.tofile(fout) #batch1 = np.array([[0,1], [1,2], [2,3]]).astype("int64").reshape(batch_size,2,1) #batch2 = np.array([[1,2], [2,3], [3,4]]).astype("int64").reshape(batch_size,2,1) batch1 = np.ones( (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1) batch2 = np.ones( (batch_size, 2, 1)).astype("int64").reshape(batch_size, 2, 1) data = [batch1, batch2] filelist = [] for i in range(2): filelist.append("test_pipeline_input_" + str(i)) for f in filelist: with open(f, "wb") as fout: for batch_data in data: for ins in batch_data: for slot in ins: binary_print(slot, fout) dataset = fluid.DatasetFactory().create_dataset( "FileInstantDataset") dataset.set_use_var([x, y]) dataset.set_batch_size(batch_size) dataset.set_filelist(filelist) for epoch in range(1): exe.train_from_dataset(fluid.default_main_program(), dataset, thread=1, debug=True, fetch_list=[], fetch_info=[], print_period=1) for f in filelist: os.remove(f) if os.path.isdir("dump_log"): shutil.rmtree("dump_log")
def create_vcr_model(pyreader_name, ernie_config, task_group, is_prediction=False): """ create model arc for vcr tasks """ shapes = [ [-1, args.max_seq_len, 1], #src_id [-1, args.max_seq_len, 1], #pos_id [-1, args.max_seq_len, 1], #sent_id [-1, args.max_seq_len, 1], #task_id [-1, args.max_seq_len, 1], #input_mask [-1, args.max_img_len, args.feature_size], #image_embedding [-1, args.max_img_len, 5], #image_loc [-1, args.max_img_len, 1], #image_mask [-1, 1], #labels [-1, 1], #qids [], #task_index [-1, 1], #binary_labels ] dtypes = [ 'int64', 'int64', 'int64', 'int64', 'float32', 'float32', 'float32', 'float32', 'int64', 'int64', 'int64', 'float32' ] lod_levels = [0] * len(dtypes) for _ in task_group: shapes.append([]) dtypes.append('float') lod_levels.append(0) pyreader = fluid.layers.py_reader(capacity=30, shapes=shapes, dtypes=dtypes, lod_levels=lod_levels, name=pyreader_name, use_double_buffer=False) inputs = fluid.layers.read_file(pyreader) src_ids, pos_ids, sent_ids, task_ids, input_mask, image_embeddings, \ image_loc, image_mask, labels, q_ids, task_index, binary_labels = inputs[: 12] ernie_vil = ErnieVilModel(src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, task_ids=task_ids, input_mask=input_mask, image_embeddings=image_embeddings, image_loc=image_loc, input_image_mask=image_mask, config=ernie_config) h_cls, h_img = ernie_vil.get_pooled_output() task_conf = task_group[0] fusion_method = task_conf["fusion_method"] fusion_fea = ernie_vil.get_match_score(text=h_cls, image=h_img, \ dropout_rate=task_conf["dropout_rate"], mode=fusion_method) if is_prediction: num_choice = int(task_conf['num_choice']) task_name = task_conf.get('task_prefix', 'vcr') score = fluid.layers.fc( fusion_fea, 1, param_attr=fluid.ParamAttr( name=task_name + "_fc.w_0", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=task_name + "_fc.b_0") score = fluid.layers.reshape(score, shape=[-1, num_choice]) _loss, _softmax = fluid.layers.softmax_with_cross_entropy( logits=score, label=labels, return_softmax=True) _acc = fluid.layers.accuracy(input=_softmax, label=labels) pred = fluid.layers.argmax(score, axis=1) mean_loss = fluid.layers.mean(_loss) task_vars = [mean_loss, _acc, pred, q_ids, labels, score] #_softmax for var in task_vars: var.persistable = True return pyreader, task_vars else: start_ind = 12 mean_loss = fluid.layers.zeros(shape=[1], dtype='float32') mean_acc = fluid.layers.zeros(shape=[1], dtype='float32') for task_conf in task_group: task_weight = inputs[start_ind] start_ind += 1 num_choice = int(task_conf['num_choice']) task_name = task_conf.get('task_prefix', 'vcr') score = fluid.layers.fc( fusion_fea, 1, param_attr=fluid.ParamAttr( name=task_name + "_fc.w_0", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=task_name + "_fc.b_0") _loss = fluid.layers.sigmoid_cross_entropy_with_logits( score, binary_labels, name="cross_entropy_loss") tmp_score = fluid.layers.reshape(score, shape=[-1, num_choice]) _softmax = fluid.layers.softmax(tmp_score) _acc = fluid.layers.accuracy(input=_softmax, label=labels) _mean_loss = fluid.layers.mean(_loss) mean_loss += _mean_loss * task_weight mean_acc += _acc * task_weight # Added score & labels for roc_auc task_vars = [ fluid.layers.reduce_mean(mean_loss), mean_acc, score, binary_labels ] for var in task_vars: var.persistable = True return pyreader, task_vars
def create_model(args, bert_config, num_labels, is_prediction=False): input_fields = { 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'labels'], 'shapes': [[None, None], [None, None], [None, None], [None, None, 1], [None, 1]], 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'], 'lod_levels': [0, 0, 0, 0, 0], } inputs = [ fluid.data( name=input_fields['names'][i], shape=input_fields['shapes'][i], dtype=input_fields['dtypes'][i], lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names'])) ] (src_ids, pos_ids, sent_ids, input_mask, labels) = inputs data_loader = fluid.io.DataLoader.from_generator( feed_list=inputs, capacity=50, iterable=False) bert = BertModel( src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, input_mask=input_mask, config=bert_config, use_fp16=args.use_fp16) cls_feats = bert.get_pooled_output() cls_feats = fluid.layers.dropout( x=cls_feats, dropout_prob=0.1, dropout_implementation="upscale_in_train") logits = fluid.layers.fc( input=cls_feats, num_flatten_dims=2, size=num_labels, param_attr=fluid.ParamAttr( name="cls_out_w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=fluid.ParamAttr( name="cls_out_b", initializer=fluid.initializer.Constant(0.))) if is_prediction: probs = fluid.layers.softmax(logits) feed_targets_name = [ src_ids.name, pos_ids.name, sent_ids.name, input_mask.name ] return data_loader, probs, feed_targets_name logits = fluid.layers.reshape(logits, [-1, num_labels], inplace=True) ce_loss, probs = fluid.layers.softmax_with_cross_entropy( logits=logits, label=labels, return_softmax=True) loss = fluid.layers.mean(x=ce_loss) num_seqs = fluid.layers.create_tensor(dtype='int64') accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) return data_loader, loss, probs, accuracy, num_seqs
def create_model(args, pyreader_name, ernie_config, is_prediction=False): pyreader = fluid.layers.py_reader( capacity=50, shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64'], lod_levels=[0, 0, 0, 0, 0, 0], name=pyreader_name, use_double_buffer=True) (src_ids, sent_ids, pos_ids, input_mask, labels, qids) = fluid.layers.read_file(pyreader) ernie = ErnieModel(src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, input_mask=input_mask, config=ernie_config, use_fp16=args.use_fp16) cls_feats = ernie.get_pooled_output() cls_feats = fluid.layers.dropout(x=cls_feats, dropout_prob=0.1, dropout_implementation="upscale_in_train") logits = fluid.layers.fc( input=cls_feats, size=args.num_labels, param_attr=fluid.ParamAttr( name="cls_out_w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=fluid.ParamAttr(name="cls_out_b", initializer=fluid.initializer.Constant(0.))) if is_prediction: probs = fluid.layers.softmax(logits) feed_targets_name = [ src_ids.name, pos_ids.name, sent_ids.name, input_mask.name ] return pyreader, probs, feed_targets_name ce_loss, probs = fluid.layers.softmax_with_cross_entropy( logits=logits, label=labels, return_softmax=True) loss = fluid.layers.mean(x=ce_loss) if args.use_fp16 and args.loss_scaling > 1.0: loss *= args.loss_scaling num_seqs = fluid.layers.create_tensor(dtype='int64') accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) graph_vars = { "loss": loss, "probs": probs, "accuracy": accuracy, "labels": labels, "num_seqs": num_seqs, "qids": qids } for k, v in graph_vars.items(): v.persistable = True return pyreader, graph_vars
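# Both classification heads above feed the [batch, num_labels] logits into
# softmax_with_cross_entropy, which fuses the softmax and the negative log-likelihood.
# A small NumPy sketch of what it computes for a single example; the values are
# illustrative.
import numpy as np

logits = np.array([2.0, 0.5, -1.0])       # one row of the classifier output
label = 0                                 # gold class index

probs = np.exp(logits - logits.max())
probs /= probs.sum()                      # softmax, ~[0.79, 0.18, 0.04]
ce_loss = -np.log(probs[label])           # cross entropy for this example, ~0.24

# softmax_with_cross_entropy(..., return_softmax=True) returns both the per-example
# loss and the softmax probabilities, which are then averaged / used for accuracy.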
def create_model(bert_config, is_training=False): if is_training: input_fields = { 'names': [ 'src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'start_positions', 'end_positions' ], 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]], 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64', 'int64'], 'lod_levels': [0, 0, 0, 0, 0, 0], } else: input_fields = { 'names': ['src_ids', 'pos_ids', 'sent_ids', 'input_mask', 'unique_id'], 'shapes': [[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1]], 'dtypes': ['int64', 'int64', 'int64', 'float32', 'int64'], 'lod_levels': [0, 0, 0, 0, 0], } inputs = [ fluid.layers.data(name=input_fields['names'][i], shape=input_fields['shapes'][i], dtype=input_fields['dtypes'][i], lod_level=input_fields['lod_levels'][i]) for i in range(len(input_fields['names'])) ] pyreader = fluid.io.PyReader(feed_list=inputs, capacity=50, iterable=False) if is_training: (src_ids, pos_ids, sent_ids, input_mask, start_positions, end_positions) = inputs else: (src_ids, pos_ids, sent_ids, input_mask, unique_id) = inputs bert = BertModel(src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, input_mask=input_mask, config=bert_config, use_fp16=args.use_fp16) enc_out = bert.get_sequence_output() logits = fluid.layers.fc( input=enc_out, size=2, num_flatten_dims=2, param_attr=fluid.ParamAttr( name="cls_squad_out_w", initializer=fluid.initializer.TruncatedNormal(scale=0.02)), bias_attr=fluid.ParamAttr(name="cls_squad_out_b", initializer=fluid.initializer.Constant(0.))) logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1]) start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0) batch_ones = fluid.layers.fill_constant_batch_size_like(input=start_logits, dtype='int64', shape=[1], value=1) num_seqs = fluid.layers.reduce_sum(input=batch_ones) if is_training: def compute_loss(logits, positions): loss = fluid.layers.softmax_with_cross_entropy(logits=logits, label=positions) loss = fluid.layers.mean(x=loss) return loss start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 if args.use_fp16 and args.loss_scaling > 1.0: total_loss = total_loss * args.loss_scaling return pyreader, total_loss, num_seqs else: return pyreader, unique_id, start_logits, end_logits, num_seqs
def linear(inputs, size, para_name, args):
    """Linear projection with named parameters; `size` is the output width."""
    return layers.fc(input=inputs,
                     size=size,
                     param_attr=fluid.ParamAttr(name=para_name + '_w'),
                     bias_attr=fluid.ParamAttr(name=para_name + '_b'))
def __init__(self, args, pretrained_embed=None): super(Model, self).__init__() self.args = args # the embedding layer self.word_embed = dygraph.Embedding(size=(args.n_words, args.n_embed)) if args.pretrained_embed_shape is not None: if pretrained_embed is not None: pre_param_attrs = fluid.ParamAttr( name="pretrained_emb", initializer=initializer.NumpyArrayInitializer( pretrained_embed), trainable=True) self.pretrained = dygraph.Embedding( size=args.pretrained_embed_shape, param_attr=pre_param_attrs) self.word_embed.weight = layers.create_parameter( shape=(self.args.n_words, self.args.n_embed), dtype='float32', default_initializer=initializer.Constant(value=0.0)) else: self.pretrained = dygraph.Embedding( size=args.pretrained_embed_shape) # Initialize feat feature, feat can be char or pos if args.feat == 'char': self.feat_embed = CharLSTM(n_chars=args.n_feats, n_embed=args.n_char_embed, n_out=args.n_feat_embed, pad_index=args.feat_pad_index) else: self.feat_embed = dygraph.Embedding(size=(args.n_feats, args.n_feat_embed)) self.embed_dropout = IndependentDropout(p=args.embed_dropout) # lstm layer self.lstm = BiLSTM(input_size=args.n_embed + args.n_feat_embed, hidden_size=args.n_lstm_hidden, num_layers=args.n_lstm_layers, dropout=args.lstm_dropout) self.lstm_dropout = SharedDropout(p=args.lstm_dropout) # mlp layer self.mlp_arc_h = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_arc, dropout=args.mlp_dropout) self.mlp_arc_d = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_arc, dropout=args.mlp_dropout) self.mlp_rel_h = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_rel, dropout=args.mlp_dropout) self.mlp_rel_d = MLP(n_in=args.n_lstm_hidden * 2, n_out=args.n_mlp_rel, dropout=args.mlp_dropout) # biaffine layers self.arc_attn = Biaffine(n_in=args.n_mlp_arc, bias_x=True, bias_y=False) self.rel_attn = Biaffine(n_in=args.n_mlp_rel, n_out=args.n_rels, bias_x=True, bias_y=True) self.pad_index = args.pad_index self.unk_index = args.unk_index
def gin(gw, feature, hidden_size, activation, name, init_eps=0.0, train_eps=False): """Implementation of Graph Isomorphism Network (GIN) layer. This is an implementation of the paper How Powerful are Graph Neural Networks? (https://arxiv.org/pdf/1810.00826.pdf). In their implementation, all MLPs have 2 layers. Batch normalization is applied on every hidden layer. Args: gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`) feature: A tensor with shape (num_nodes, feature_size). name: GIN layer names. hidden_size: The hidden size for gin. activation: The activation for the output. init_eps: float, optional Initial :math:`\epsilon` value, default is 0. train_eps: bool, optional if True, :math:`\epsilon` will be a learnable parameter. Return: A tensor with shape (num_nodes, hidden_size). """ def send_src_copy(src_feat, dst_feat, edge_feat): return src_feat["h"] epsilon = fluid.layers.create_parameter( shape=[1, 1], dtype="float32", attr=fluid.ParamAttr(name="%s_eps" % name), default_initializer=fluid.initializer.ConstantInitializer( value=init_eps)) if not train_eps: epsilon.stop_gradient = True msg = gw.send(send_src_copy, nfeat_list=[("h", feature)]) output = gw.recv(msg, "sum") + feature * (epsilon + 1.0) output = fluid.layers.fc(output, size=hidden_size, act=None, param_attr=fluid.ParamAttr(name="%s_w_0" % name), bias_attr=fluid.ParamAttr(name="%s_b_0" % name)) output = fluid.layers.layer_norm( output, begin_norm_axis=1, param_attr=fluid.ParamAttr( name="norm_scale_%s" % (name), initializer=fluid.initializer.Constant(1.0)), bias_attr=fluid.ParamAttr(name="norm_bias_%s" % (name), initializer=fluid.initializer.Constant(0.0)), ) if activation is not None: output = getattr(fluid.layers, activation)(output) output = fluid.layers.fc(output, size=hidden_size, act=activation, param_attr=fluid.ParamAttr(name="%s_w_1" % name), bias_attr=fluid.ParamAttr(name="%s_b_1" % name)) return output
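# The core of the GIN layer above is recv(msg, "sum") + feature * (epsilon + 1.0):
# each node sums its neighbours' features and adds its own features scaled by
# (1 + epsilon) before the two fully connected layers. A NumPy sketch of that
# aggregation on a toy graph; the edge list is an illustrative assumption.
import numpy as np

num_nodes, feat_dim = 4, 3
feature = np.random.rand(num_nodes, feat_dim)
edges = [(0, 1), (1, 0), (2, 1), (3, 2)]     # (src, dst) pairs
init_eps = 0.0

agg = np.zeros_like(feature)
for src, dst in edges:
    agg[dst] += feature[src]                 # "sum" aggregation of neighbour features

output = agg + (1.0 + init_eps) * feature    # (1 + eps) * h_v + sum of neighbours
# the real layer then applies fc -> layer_norm -> activation -> fc to `output`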
def multi_head_attention(queries, keys, values, attn_bias, d_key, d_value, d_model, n_head=1, dropout_rate=0., cache=None, param_initializer=None, name='multi_head_att'): """ Multi-Head Attention. Note that attn_bias is added to the logit before computing softmax activiation to mask certain selected positions so that they will not considered in attention weights. """ keys = queries if keys is None else keys values = keys if values is None else values if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): raise ValueError( "Inputs: quries, keys and values should all be 3-D tensors.") def __compute_qkv(queries, keys, values, n_head, d_key, d_value): """ Add linear projection to queries, keys, and values. """ q = layers.fc(input=queries, size=d_key * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_query_fc.w_0', initializer=param_initializer), bias_attr=name + '_query_fc.b_0') k = layers.fc(input=keys, size=d_key * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_key_fc.w_0', initializer=param_initializer), bias_attr=name + '_key_fc.b_0') v = layers.fc(input=values, size=d_value * n_head, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_value_fc.w_0', initializer=param_initializer), bias_attr=name + '_value_fc.b_0') return q, k, v def __split_heads(x, n_head): """ Reshape the last dimension of inpunt tensor x so that it becomes two dimensions and then transpose. Specifically, input a tensor with shape [bs, max_sequence_length, n_head * hidden_dim] then output a tensor with shape [bs, n_head, max_sequence_length, hidden_dim]. """ hidden_size = x.shape[-1] # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. reshaped = layers.reshape(x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=True) # permuate the dimensions into: # [batch_size, n_head, max_sequence_len, hidden_size_per_head] return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) def __combine_heads(x): """ Transpose and then reshape the last two dimensions of inpunt tensor x so that it becomes one dimension, which is reverse to __split_heads. """ if len(x.shape) == 3: return x if len(x.shape) != 4: raise ValueError("Input(x) should be a 4-D Tensor.") trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) # The value 0 in shape attr means copying the corresponding dimension # size of the input as the output dimension size. return layers.reshape( x=trans_x, shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], inplace=True) def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): """ Scaled Dot-Product Attention """ scaled_q = layers.scale(x=q, scale=d_key**-0.5) product = layers.matmul(x=scaled_q, y=k, transpose_y=True) if attn_bias: product += attn_bias weights = layers.softmax(product, use_cudnn=True) if dropout_rate: weights = layers.dropout(weights, dropout_prob=dropout_rate, dropout_implementation="upscale_in_train", is_test=False) out = layers.matmul(weights, v) return out q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) if cache is not None: # use cache and concat time steps # Since the inplace reshape in __split_heads changes the shape of k and # v, which is the cache input for next time step, reshape the cache # input from the previous time step first. 
k = cache["k"] = layers.concat( [layers.reshape(cache["k"], shape=[0, 0, d_model]), k], axis=1) v = cache["v"] = layers.concat( [layers.reshape(cache["v"], shape=[0, 0, d_model]), v], axis=1) q = __split_heads(q, n_head) k = __split_heads(k, n_head) v = __split_heads(v, n_head) ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate) out = __combine_heads(ctx_multiheads) # Project back to the model size. proj_out = layers.fc(input=out, size=d_model, num_flatten_dims=2, param_attr=fluid.ParamAttr( name=name + '_output_fc.w_0', initializer=param_initializer), bias_attr=name + '_output_fc.b_0') return proj_out
def gat(gw, feature, hidden_size, activation, name, num_heads=8, feat_drop=0.6, attn_drop=0.6, is_test=False): """Implementation of graph attention networks (GAT) This is an implementation of the paper GRAPH ATTENTION NETWORKS (https://arxiv.org/abs/1710.10903). Args: gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`) feature: A tensor with shape (num_nodes, feature_size). hidden_size: The hidden size for gat. activation: The activation for the output. name: Gat layer names. num_heads: The head number in gat. feat_drop: Dropout rate for feature. attn_drop: Dropout rate for attention. is_test: Whether in test phrase. Return: A tensor with shape (num_nodes, hidden_size * num_heads) """ def send_attention(src_feat, dst_feat, edge_feat): output = src_feat["left_a"] + dst_feat["right_a"] output = fluid.layers.leaky_relu(output, alpha=0.2) # (num_edges, num_heads) return {"alpha": output, "h": src_feat["h"]} def reduce_attention(msg): alpha = msg["alpha"] # lod-tensor (batch_size, seq_len, num_heads) h = msg["h"] alpha = paddle_helper.sequence_softmax(alpha) old_h = h h = fluid.layers.reshape(h, [-1, num_heads, hidden_size]) alpha = fluid.layers.reshape(alpha, [-1, num_heads, 1]) if attn_drop > 1e-15: alpha = fluid.layers.dropout( alpha, dropout_prob=attn_drop, is_test=is_test, dropout_implementation="upscale_in_train") h = h * alpha h = fluid.layers.reshape(h, [-1, num_heads * hidden_size]) h = fluid.layers.lod_reset(h, old_h) return fluid.layers.sequence_pool(h, "sum") if feat_drop > 1e-15: feature = fluid.layers.dropout( feature, dropout_prob=feat_drop, is_test=is_test, dropout_implementation='upscale_in_train') ft = fluid.layers.fc(feature, hidden_size * num_heads, bias_attr=False, param_attr=fluid.ParamAttr(name=name + '_weight')) left_a = fluid.layers.create_parameter(shape=[num_heads, hidden_size], dtype='float32', name=name + '_gat_l_A') right_a = fluid.layers.create_parameter(shape=[num_heads, hidden_size], dtype='float32', name=name + '_gat_r_A') reshape_ft = fluid.layers.reshape(ft, [-1, num_heads, hidden_size]) left_a_value = fluid.layers.reduce_sum(reshape_ft * left_a, -1) right_a_value = fluid.layers.reduce_sum(reshape_ft * right_a, -1) msg = gw.send(send_attention, nfeat_list=[("h", ft), ("left_a", left_a_value), ("right_a", right_a_value)]) output = gw.recv(msg, reduce_attention) bias = fluid.layers.create_parameter(shape=[hidden_size * num_heads], dtype='float32', is_bias=True, name=name + '_bias') bias.stop_gradient = True output = fluid.layers.elementwise_add(output, bias, act=activation) return output
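# In the GAT layer above, the unnormalized attention for an edge is
# left_a . h_src + right_a . h_dst passed through a leaky ReLU, and the softmax runs
# separately over the incoming edges of each destination node (that is what
# sequence_softmax does on the grouped messages). A single-head NumPy sketch; the
# toy edge list is an assumption.
import numpy as np

num_nodes, hidden = 4, 5
ft = np.random.rand(num_nodes, hidden)       # projected node features
left_a = np.random.rand(hidden)
right_a = np.random.rand(hidden)
edges = [(0, 2), (1, 2), (3, 2), (2, 0)]     # (src, dst)

def leaky_relu(x, alpha=0.2):
    return np.where(x > 0, x, alpha * x)

out = np.zeros_like(ft)
for dst in set(d for _, d in edges):
    srcs = [s for s, d in edges if d == dst]
    score = leaky_relu(ft[srcs] @ left_a + ft[dst] @ right_a)   # one score per incoming edge
    alpha = np.exp(score - score.max()); alpha /= alpha.sum()   # softmax over incoming edges
    out[dst] = (alpha[:, None] * ft[srcs]).sum(axis=0)          # weighted sum of neighbours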
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers

# Program to generate the parameter.
# The original 'weight' is filled with the value 1 and has shape (4, 8).
ones = np.ones((4, 8)).astype('float32')

main_prog = fluid.Program()
start_prog = fluid.Program()
with fluid.program_guard(main_prog, start_prog):
    input = fluid.data('input', shape=[-1, 4], dtype='float32')
    output = layers.fc(
        input,
        8,
        param_attr=fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.NumpyArrayInitializer(ones)))

exe = fluid.Executor(fluid.CPUPlace())
# initialize all parameters
exe.run(start_prog)
# simulate saving the model
fluid.io.save_persistables(exe, dirname="old", main_program=main_prog)

#############################################################################
# The following section illustrates what the user should do to adjust the   #
# parameter.                                                                #
#############################################################################
# The target 'weight' is the concatenation of the original 'weight' and a
# supplementary weight filled with 0 of shape (4, 8).
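# The snippet stops right where the adjusted parameter would be built. A hypothetical
# continuation, sketching one way to finish the adjustment stated in the comment
# (an (8, 8) 'weight' made of the old block plus a zero block); the new program and
# variable names are illustrative assumptions, not the original author's code.
zeros = np.zeros((4, 8)).astype('float32')
new_weight = np.concatenate([ones, zeros], axis=0)    # shape (8, 8)

new_main = fluid.Program()
new_start = fluid.Program()
with fluid.program_guard(new_main, new_start):
    new_input = fluid.data('input', shape=[-1, 8], dtype='float32')
    new_output = layers.fc(
        new_input,
        8,
        param_attr=fluid.ParamAttr(
            name='weight',
            initializer=fluid.initializer.NumpyArrayInitializer(new_weight)))
exe.run(new_start)                                    # 'weight' is now (8, 8)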
def compute_span_end_logits(input_tensor, span_mask, flat_start_positions, args, name=""): input_shape = list(input_tensor.shape) span_mask_shape = list(span_mask.shape) batch_size = args.start_top_k * args.batch_size seq_length = span_mask_shape[1] width = input_shape[-1] start_vectors = gather_indexes(input_tensor, flat_start_positions) start_vectors = fluid.layers.reshape(x=start_vectors, shape=[-1, 1, width]) start_vectors = fluid.layers.expand(x=start_vectors, expand_times=[1, seq_length, 1]) concat_input = fluid.layers.concat(input=[start_vectors, input_tensor], axis=2) weights = fluid.ParamAttr(name=name + "conditional_fc_weights", initializer=create_initializer(0.02)) bias = fluid.ParamAttr(name=name + "conditional_fc_bias") concat_input_reshape = fluid.layers.reshape(x=concat_input, shape=[-1, 2 * width]) conditional_tensor = fluid.layers.fc(input=concat_input_reshape, size=width, act="gelu", name=name + "span_end_conditional", param_attr=weights, bias_attr=bias) conditional_tensor_reshape = fluid.layers.reshape( x=conditional_tensor, shape=[-1, seq_length, width]) conditional_tensor = fluid.layers.layer_norm( input=conditional_tensor_reshape, begin_norm_axis=2, param_attr=fluid.ParamAttr(name=name + "conditional_layernorm_gamma", initializer=create_initializer(0.02)), bias_attr=fluid.ParamAttr(name=name + "conditional_layernorm_beta")) end_weights = fluid.layers.create_parameter( name=name + "span_end_weights", shape=[width], dtype='float32', default_initializer=create_initializer(0.02)) template_var = fluid.layers.fill_constant_batch_size_like( conditional_tensor, shape=list(conditional_tensor.shape), dtype='float32', value=0) end_weights = fluid.layers.reshape(x=end_weights, shape=[1, width]) end_weights = fluid.layers.expand(x=end_weights, expand_times=[seq_length, 1]) end_weights = fluid.layers.elementwise_add(template_var, end_weights, axis=-1) raw_scores = fluid.layers.reduce_sum(conditional_tensor * end_weights, dim=-1) raw_scores += (1.0 - fluid.layers.cast(x=span_mask, dtype='float32')) * -10000.0 logits = fluid.layers.reshape(x=raw_scores, shape=[-1, seq_length]) return logits
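# raw_scores += (1.0 - cast(span_mask)) * -10000.0 is the usual additive-mask trick:
# positions outside the valid span get a large negative score so a later softmax
# assigns them (almost) zero probability. A NumPy sketch with toy values.
import numpy as np

raw_scores = np.array([1.2, 0.3, -0.5, 0.8])
span_mask = np.array([1, 1, 1, 0])            # last position is padding / invalid

masked = raw_scores + (1.0 - span_mask.astype("float32")) * -10000.0
probs = np.exp(masked - masked.max())
probs /= probs.sum()                          # masked position gets ~0 probability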
def get_pretraining_output(self, mask_label, mask_pos, labels): """Get the loss & accuracy for pretraining""" mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') # extract the first token feature in each sentence next_sent_feat = self.get_pooled_output() reshaped_emb_out = fluid.layers.reshape(x=self._enc_out, shape=[-1, self._emb_size]) # extract masked tokens' feature mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) # transform: fc mask_trans_feat = fluid.layers.fc( input=mask_feat, size=self._emb_size, act=self._hidden_act, param_attr=fluid.ParamAttr(name=self.model_name + 'mask_lm_trans_fc.w_0', initializer=self._param_initializer), bias_attr=fluid.ParamAttr(name=self.model_name + 'mask_lm_trans_fc.b_0')) # transform: layer norm mask_trans_feat = pre_process_layer(mask_trans_feat, 'n', name=self.model_name + 'mask_lm_trans') mask_lm_out_bias_attr = fluid.ParamAttr( name=self.model_name + "mask_lm_out_fc.b_0", initializer=fluid.initializer.Constant(value=0.0)) if self._weight_sharing: fc_out = fluid.layers.matmul( x=mask_trans_feat, y=fluid.default_main_program().global_block().var( self._word_emb_name), transpose_y=True) fc_out += fluid.layers.create_parameter(shape=[self._voc_size], dtype=self._dtype, attr=mask_lm_out_bias_attr, is_bias=True) else: fc_out = fluid.layers.fc( input=mask_trans_feat, size=self._voc_size, param_attr=fluid.ParamAttr( name=self.model_name + "mask_lm_out_fc.w_0", initializer=self._param_initializer), bias_attr=mask_lm_out_bias_attr) mask_lm_loss = fluid.layers.softmax_with_cross_entropy( logits=fc_out, label=mask_label) mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss) next_sent_fc_out = fluid.layers.fc( input=next_sent_feat, size=2, param_attr=fluid.ParamAttr(name=self.model_name + "next_sent_fc.w_0", initializer=self._param_initializer), bias_attr=self.model_name + "next_sent_fc.b_0") next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( logits=next_sent_fc_out, label=labels, return_softmax=True) next_sent_acc = fluid.layers.accuracy(input=next_sent_softmax, label=labels) mean_next_sent_loss = fluid.layers.mean(next_sent_loss) loss = mean_next_sent_loss + mean_mask_lm_loss return next_sent_acc, mean_mask_lm_loss, loss
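# When _weight_sharing is enabled, the masked-LM head reuses the word embedding matrix
# as the output projection: logits = matmul(hidden, word_emb, transpose_y=True) plus a
# separate vocabulary-sized bias. A NumPy sketch of the tied projection; the shapes
# are illustrative.
import numpy as np

voc_size, emb_size, num_masked = 10, 4, 3
word_emb = np.random.rand(voc_size, emb_size)      # embedding table, [voc_size, emb_size]
mask_trans_feat = np.random.rand(num_masked, emb_size)
out_bias = np.zeros(voc_size)

# tied projection: reuse the embedding table instead of a separate output fc weight
fc_out = mask_trans_feat @ word_emb.T + out_bias   # [num_masked, voc_size] logits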
def train(use_cuda, save_dirname=None, is_local=True):
    # define network topology
    word = fluid.layers.data(
        name='word_data', shape=[1], dtype='int64', lod_level=1)
    predicate = fluid.layers.data(
        name='verb_data', shape=[1], dtype='int64', lod_level=1)
    ctx_n2 = fluid.layers.data(
        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
    ctx_n1 = fluid.layers.data(
        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
    ctx_0 = fluid.layers.data(
        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
    ctx_p1 = fluid.layers.data(
        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
    ctx_p2 = fluid.layers.data(
        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
    mark = fluid.layers.data(
        name='mark_data', shape=[1], dtype='int64', lod_level=1)
    feature_out = db_lstm(**locals())
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr=fluid.ParamAttr(
            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(crf_cost)

    # TODO(qiao)
    # check other optimizers and check why out will be NAN
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=0.01,
            decay_steps=100000,
            decay_rate=0.5,
            staircase=True))
    sgd_optimizer.minimize(avg_cost)

    # TODO(qiao)
    # add dependency track and move this config before optimizer
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(
        feed_list=[
            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
        ],
        place=place)
    exe = fluid.Executor(place)

    def train_loop(main_program):
        exe.run(fluid.default_startup_program())
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(
            load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
            place)

        start_time = time.time()
        batch_id = 0
        for pass_id in range(PASS_NUM):
            for data in train_data():
                cost = exe.run(main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                cost = cost[0]

                if batch_id % 10 == 0:
                    print("avg_cost:" + str(cost))
                    if batch_id != 0:
                        print("second per batch: " + str(
                            (time.time() - start_time) / batch_id))
                    # Set the threshold low to speed up the CI test
                    if float(cost) < 80.0:
                        if save_dirname is not None:
                            # TODO(liuyiqun): Change the target to crf_decode
                            fluid.io.save_inference_model(save_dirname, [
                                'word_data', 'verb_data', 'ctx_n2_data',
                                'ctx_n1_data', 'ctx_0_data', 'ctx_p1_data',
                                'ctx_p2_data', 'mark_data'
                            ], [feature_out], exe)
                        return

                batch_id = batch_id + 1

        raise RuntimeError(
            "This model should save_inference_model and return, but not reach here, please check!"
        )

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
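# Example usage (a sketch; the save directory name is illustrative only).
# Running locally trains until avg_cost drops below the CI threshold and then
# writes an inference model to save_dirname:
#
#   train(use_cuda=False,
#         save_dirname="label_semantic_roles.inference.model",
#         is_local=True)
#
# For the distributed branch (is_local=False), PADDLE_PSERVER_PORT,
# PADDLE_PSERVER_IPS, PADDLE_TRAINERS, POD_IP, PADDLE_TRAINER_ID and
# PADDLE_TRAINING_ROLE must be set in the environment before calling train().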
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Conv2D
from paddle.fluid.initializer import NumpyArrayInitializer
from PIL import Image               # needed for Image.open below
import matplotlib.pyplot as plt     # needed for plt.figure below

img = Image.open(r'D:\softwaresavfile\Github\machine_learning\计算视觉基础\test.jpg')
with fluid.dygraph.guard():
    # set the convolution kernel weights (a 3x3 Laplacian-style edge filter)
    w = np.array([[-1, -1, -1], [-1, 8, -1], [-1, -1, -1]], dtype='float32') / 8
    w = w.reshape([1, 1, 3, 3])
    # the input has 3 channels, so expand the kernel shape
    # from [1, 1, 3, 3] to [1, 3, 3, 3]
    w = np.repeat(w, 3, axis=1)
    # create the convolution operator: 1 output channel, 3x3 kernel,
    # initialized with the weights defined above
    conv = Conv2D(num_channels=3,
                  num_filters=1,
                  filter_size=[3, 3],
                  param_attr=fluid.ParamAttr(
                      initializer=NumpyArrayInitializer(value=w)))
    # convert the loaded image to a float32 numpy.ndarray
    x = np.array(img).astype('float32')
    # the image is loaded as an [H, W, 3] ndarray;
    # move the channel dimension to the front
    x = np.transpose(x, (2, 0, 1))
    print("image height and width =", img.height, img.width)
    # reshape the data to [N, C, H, W]
    x = x.reshape(1, 3, img.height, img.width)
    x = fluid.dygraph.to_variable(x)
    y = conv(x)
    out = y.numpy()
    plt.figure(figsize=(20, 10))
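# A possible way to display the result (a sketch; the snippet above stops at
# plt.figure, so the subplot/imshow calls below are an assumption about how it
# continues):
#
#   plt.subplot(1, 2, 1)
#   plt.title('input image')
#   plt.imshow(img)
#   plt.subplot(1, 2, 2)
#   plt.title('edge response')
#   plt.imshow(out.squeeze(), cmap='gray')
#   plt.show()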
def train(conf_dict, data_reader, use_cuda=False):
    """
    Training of so labeling model
    """
    # input data layer
    word = fluid.layers.data(
        name='word_data', shape=[1], dtype='int64', lod_level=1)
    postag = fluid.layers.data(
        name='token_pos', shape=[1], dtype='int64', lod_level=1)
    p_word = fluid.layers.data(
        name='p_word', shape=[1], dtype='int64', lod_level=1)
    # label
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)
    # embedding + lstm
    feature_out = spo_model.db_lstm(data_reader, word,
                                    postag, p_word, conf_dict)

    # loss function
    # crf layer
    mix_hidden_lr = float(conf_dict['mix_hidden_lr'])
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
        param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(crf_cost)

    # optimizer
    sgd_optimizer = fluid.optimizer.AdamOptimizer(learning_rate=2e-3)
    sgd_optimizer.minimize(avg_cost)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    train_batch_reader = paddle.batch(
        paddle.reader.shuffle(data_reader.get_train_reader(), buf_size=8192),
        batch_size=conf_dict['batch_size'])

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, postag, p_word, target],
                              place=place)
    exe = fluid.Executor(place)

    save_dirname = conf_dict['spo_model_save_dir']

    def train_loop(main_program, trainer_id=0):
        """start train loop"""
        exe.run(fluid.default_startup_program())

        start_time = time.time()
        batch_id = 0
        for pass_id in six.moves.xrange(conf_dict['pass_num']):
            pass_start_time = time.time()
            cost_sum, cost_counter = 0, 0
            for data in train_batch_reader():
                cost = exe.run(main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                cost = cost[0]
                cost_sum += cost
                cost_counter += 1
                if batch_id % 10 == 0 and batch_id != 0:
                    #sys.stderr.write("batch %d finished, second per batch: %02f\n" % (
                    #    batch_id, (time.time() - start_time) / batch_id))
                    # cost expected, training over
                    if float(cost) < 1:
                        save_path = os.path.join(save_dirname, 'final')
                        # note: the feed target is 'token_pos', matching the
                        # data layer defined above (was 'token_dist')
                        fluid.io.save_inference_model(
                            save_path, ['word_data', 'token_pos', 'p_word'],
                            [feature_out], exe, params_filename='params')
                        return
                batch_id = batch_id + 1

            # save the model once each pass ends
            pass_avg_cost = cost_sum / cost_counter if cost_counter > 0 else 0.0
            #sys.stderr.write("%d pass end, cost time: %02f, avg_cost: %f" % (
            #    pass_id, time.time() - pass_start_time, pass_avg_cost))
            save_path = os.path.join(save_dirname,
                                     'pass_%04d-%f' % (pass_id, pass_avg_cost))
            fluid.io.save_inference_model(
                save_path, ['word_data', 'token_pos', 'p_word'],
                [feature_out], exe, params_filename='params')
        else:
            # pass times complete and the training is over
            save_path = os.path.join(save_dirname, 'final')
            fluid.io.save_inference_model(
                save_path, ['word_data', 'token_pos', 'p_word'],
                [feature_out], exe, params_filename='params')
            return

    train_loop(fluid.default_main_program())


def main(conf_dict, use_cuda=False):
    """Train main function"""
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    data_generator = spo_data_reader.DataReader(
        wordemb_dict_path=conf_dict['word_idx_path'],
        postag_dict_path=conf_dict['postag_dict_path'],
        label_dict_path=conf_dict['so_label_dict_path'],
        p_eng_dict_path=conf_dict['label_dict_path'],
        train_data_list_path=conf_dict['spo_train_data_path'],
        test_data_list_path=conf_dict['spo_test_data_path'])

    train(conf_dict, data_generator, use_cuda=use_cuda)


if __name__ == '__main__':
    # Load the configuration file
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--conf_path",
        type=str,
        help="conf_file_path_for_model. (default: %(default)s)",
        required=True)
    args = parser.parse_args()
    conf_dict = conf_lib.load_conf(args.conf_path)
    use_gpu = conf_dict.get('use_gpu', 'False') == 'True'
    main(conf_dict, use_cuda=use_gpu)
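# Example invocation (a sketch; the script and config file names are
# placeholders, not the actual paths shipped with this code):
#
#   python this_spo_train_script.py --conf_path path/to/spo_model.conf
#
# The config is expected to provide the dictionary paths, data paths,
# batch_size, pass_num, mix_hidden_lr and spo_model_save_dir used above.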
def _build_model(self, src_ids, position_ids, sentence_ids, input_mask):
    # padding id in vocabulary must be set to 0
    emb_out = fluid.layers.embedding(
        input=src_ids,
        size=[self._voc_size, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(name=self._word_emb_name,
                                   initializer=self._param_initializer),
        is_sparse=False)
    position_emb_out = fluid.layers.embedding(
        input=position_ids,
        size=[self._max_position_seq_len, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(name=self._pos_emb_name,
                                   initializer=self._param_initializer))
    sent_emb_out = fluid.layers.embedding(
        sentence_ids,
        size=[self._sent_types, self._emb_size],
        dtype=self._dtype,
        param_attr=fluid.ParamAttr(name=self._sent_emb_name,
                                   initializer=self._param_initializer))

    emb_out = emb_out + position_emb_out
    emb_out = emb_out + sent_emb_out

    emb_out = pre_process_layer(emb_out,
                                'nd',
                                self._prepostprocess_dropout,
                                name='pre_encoder')

    if self._dtype == core.VarDesc.VarType.FP16:
        input_mask = fluid.layers.cast(x=input_mask, dtype=self._dtype)

    self_attn_mask = fluid.layers.matmul(x=input_mask,
                                         y=input_mask,
                                         transpose_y=True)
    self_attn_mask = fluid.layers.scale(x=self_attn_mask,
                                        scale=10000.0,
                                        bias=-1.0,
                                        bias_after_scale=False)
    n_head_self_attn_mask = fluid.layers.stack(x=[self_attn_mask] * self._n_head,
                                               axis=1)
    n_head_self_attn_mask.stop_gradient = True

    self._enc_out = encoder(
        enc_input=emb_out,
        attn_bias=n_head_self_attn_mask,
        n_layer=self._n_layer,
        n_head=self._n_head,
        d_key=self._emb_size // self._n_head,
        d_value=self._emb_size // self._n_head,
        d_model=self._emb_size,
        d_inner_hid=self._emb_size * 4,
        prepostprocess_dropout=self._prepostprocess_dropout,
        attention_dropout=self._attention_dropout,
        relu_dropout=0,
        hidden_act=self._hidden_act,
        preprocess_cmd="",
        postprocess_cmd="dan",
        param_initializer=self._param_initializer,
        name='encoder')
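# A NumPy sketch (hypothetical name _demo_attn_bias) of the attention-bias
# construction in _build_model: with input_mask holding 1.0 for real tokens and
# 0.0 for padding, (mask @ mask.T - 1) * 10000 yields 0 for real-real pairs and
# -10000 wherever either position is padding, which is what the scale call with
# bias=-1.0 and bias_after_scale=False computes before the bias is added to the
# raw attention scores.
def _demo_attn_bias():
    import numpy as np
    input_mask = np.array([[1.0], [1.0], [0.0]], dtype='float32')  # seq_len = 3
    self_attn_mask = input_mask.dot(input_mask.T)                  # [3, 3]
    attn_bias = (self_attn_mask - 1.0) * 10000.0                   # 0 or -10000
    return attn_bias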
def create_model(pyreader_name, bert_config, max_wn_concept_length,
                 max_nell_concept_length, wn_concept_embedding_mat,
                 nell_concept_embedding_mat, is_training=False, freeze=False):
    if is_training:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, max_wn_concept_length, 1],
                    [-1, args.max_seq_len, max_nell_concept_length, 1],
                    [-1, args.max_seq_len, 1], [-1, 1], [-1, 1]],
            dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32',
                    'int64', 'int64'],
            lod_levels=[0, 0, 0, 0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)
        (src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids,
         input_mask, start_positions, end_positions) = fluid.layers.read_file(pyreader)
    else:
        pyreader = fluid.layers.py_reader(
            capacity=50,
            shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, 1],
                    [-1, args.max_seq_len, max_wn_concept_length, 1],
                    [-1, args.max_seq_len, max_nell_concept_length, 1],
                    [-1, args.max_seq_len, 1], [-1, 1]],
            dtypes=['int64', 'int64', 'int64', 'int64', 'int64', 'float32',
                    'int64'],
            lod_levels=[0, 0, 0, 0, 0, 0, 0],
            name=pyreader_name,
            use_double_buffer=True)
        (src_ids, pos_ids, sent_ids, wn_concept_ids, nell_concept_ids,
         input_mask, unique_id) = fluid.layers.read_file(pyreader)

    '''1st Layer: BERT Layer'''
    bert = BertModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        sentence_ids=sent_ids,
        input_mask=input_mask,
        config=bert_config,
        use_fp16=args.use_fp16)

    enc_out = bert.get_sequence_output()
    if freeze:
        enc_out.stop_gradient = True
    logger.info("enc_out.stop_gradient: {}".format(enc_out.stop_gradient))

    '''2nd layer: Memory Layer'''
    # get memory embedding
    wn_concept_vocab_size = wn_concept_embedding_mat.shape[0]
    wn_concept_dim = wn_concept_embedding_mat.shape[1]
    nell_concept_vocab_size = nell_concept_embedding_mat.shape[0]
    nell_concept_dim = nell_concept_embedding_mat.shape[1]
    wn_memory_embs = fluid.layers.embedding(
        wn_concept_ids,
        size=(wn_concept_vocab_size, wn_concept_dim),
        param_attr=fluid.ParamAttr(name="wn_concept_emb_mat",
                                   do_model_average=False,
                                   trainable=False),
        dtype='float32')
    nell_memory_embs = fluid.layers.embedding(
        nell_concept_ids,
        size=(nell_concept_vocab_size, nell_concept_dim),
        param_attr=fluid.ParamAttr(name="nell_concept_emb_mat",
                                   do_model_average=False,
                                   trainable=False),
        dtype='float32')

    # get memory length
    wn_concept_ids_reduced = fluid.layers.equal(
        wn_concept_ids,
        fluid.layers.fill_constant(shape=[1], value=0, dtype="int64"))  # [batch_size, sent_size, concept_size, 1]
    wn_concept_ids_reduced = fluid.layers.cast(
        wn_concept_ids_reduced, dtype="float32")  # [batch_size, sent_size, concept_size, 1]
    wn_concept_ids_reduced = fluid.layers.scale(
        fluid.layers.elementwise_sub(
            wn_concept_ids_reduced,
            fluid.layers.fill_constant([1], "float32", 1)),
        scale=-1)
    wn_mem_length = fluid.layers.reduce_sum(
        wn_concept_ids_reduced, dim=2)  # [batch_size, sent_size, 1]

    nell_concept_ids_reduced = fluid.layers.equal(
        nell_concept_ids,
        fluid.layers.fill_constant(shape=[1], value=0, dtype="int64"))  # [batch_size, sent_size, concept_size, 1]
    nell_concept_ids_reduced = fluid.layers.cast(
        nell_concept_ids_reduced, dtype="float32")  # [batch_size, sent_size, concept_size, 1]
    nell_concept_ids_reduced = fluid.layers.scale(
        fluid.layers.elementwise_sub(
            nell_concept_ids_reduced,
            fluid.layers.fill_constant([1], "float32", 1)),
        scale=-1)
    nell_mem_length = fluid.layers.reduce_sum(
        nell_concept_ids_reduced, dim=2)  # [batch_size, sent_size, 1]

    # select and integrate
    wn_memory_layer = MemoryLayer(bert_config, max_wn_concept_length,
                                  wn_concept_dim, mem_method='raw', prefix='wn')
    wn_memory_output = wn_memory_layer.forward(enc_out, wn_memory_embs,
                                               wn_mem_length,
                                               ignore_no_memory_token=True)

    nell_memory_layer = MemoryLayer(bert_config, max_nell_concept_length,
                                    nell_concept_dim, mem_method='raw',
                                    prefix='nell')
    nell_memory_output = nell_memory_layer.forward(enc_out, nell_memory_embs,
                                                   nell_mem_length,
                                                   ignore_no_memory_token=True)

    memory_output = fluid.layers.concat(
        [enc_out, wn_memory_output, nell_memory_output], axis=2)

    '''3rd layer: Self-Matching Layer'''
    # calculate input dim for self-matching layer
    memory_output_size = bert_config['hidden_size'] + wn_concept_dim + nell_concept_dim
    logger.info("memory_output_size: {}".format(memory_output_size))

    # do matching
    self_att_layer = TriLinearTwoTimeSelfAttentionLayer(
        memory_output_size,
        dropout_rate=0.0,
        cat_mul=True,
        cat_sub=True,
        cat_twotime=True,
        cat_twotime_mul=False,
        cat_twotime_sub=True)  # [bs, sq, concat_hs]
    att_output = self_att_layer.forward(memory_output, input_mask)  # [bs, sq, concat_hs]

    '''4th layer: Output Layer'''
    logits = fluid.layers.fc(
        input=att_output,
        size=2,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_squad_out_w",
            initializer=fluid.initializer.NormalInitializer(
                loc=0.0, scale=bert_config['initializer_range'])),
        bias_attr=fluid.ParamAttr(
            name="cls_squad_out_b",
            initializer=fluid.initializer.Constant(0.)))

    logits = fluid.layers.transpose(x=logits, perm=[2, 0, 1])
    start_logits, end_logits = fluid.layers.unstack(x=logits, axis=0)

    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=start_logits, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    if is_training:
        def compute_loss(logits, positions):
            loss = fluid.layers.softmax_with_cross_entropy(
                logits=logits, label=positions)
            loss = fluid.layers.mean(x=loss)
            return loss

        start_loss = compute_loss(start_logits, start_positions)
        end_loss = compute_loss(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2.0
        if args.use_fp16 and args.loss_scaling > 1.0:
            total_loss = total_loss * args.loss_scaling
        return pyreader, total_loss, num_seqs
    else:
        return pyreader, unique_id, start_logits, end_logits, num_seqs
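# A NumPy sketch (hypothetical name _demo_memory_length) of the "get memory
# length" block inside create_model: concept id 0 marks padding, so
# equal-to-zero, cast, subtract 1 and negate yields 1.0 for real concepts, and
# summing over the concept axis counts how many concepts each token has.
# Shapes are simplified here to [tokens, concepts, 1] instead of the full
# [batch, seq_len, concepts, 1].
def _demo_memory_length():
    import numpy as np
    concept_ids = np.array([[[3], [5], [0]],      # token 0: 2 real concepts
                            [[0], [0], [0]]])     # token 1: no concepts
    is_pad = (concept_ids == 0).astype('float32')
    is_real = (is_pad - 1.0) * -1.0                # 1.0 for real concepts
    mem_length = is_real.sum(axis=1)               # -> [[2.], [0.]]
    return mem_length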
def dyanmic_gru_op(self, **kwargs):
    role = kwargs['role']
    data = kwargs['data']
    data_share = kwargs['data_share'][role]
    weight = kwargs['weight']
    weight_share = kwargs['weight_share'][role]
    return_results = kwargs['return_results']
    return_results_cheb = kwargs['return_results_cheb']
    expected_result = kwargs['expect_results']

    pfl_mpc.init("aby3", role, "localhost", self.server, int(self.port))

    hidden_dim = 1

    data_paddle = fluid.data(name='input_paddle',
                             shape=[3, 3],
                             dtype='float32',
                             lod_level=1)
    ldata_paddle = fluid.create_lod_tensor(data, [[3]], fluid.CPUPlace())
    w_param_attrs = fluid.ParamAttr(
        name='gru_weight',
        learning_rate=0.5,
        initializer=fluid.initializer.NumpyArrayInitializer(weight),
        trainable=True)
    hidden_paddle = fluid.layers.dynamic_gru(input=data_paddle,
                                             size=hidden_dim,
                                             param_attr=w_param_attrs,
                                             gate_activation='sigmoid',
                                             candidate_activation='relu')

    data_mpc = fluid.data(name='input_mpc',
                          shape=[3, 2, 3],
                          dtype='int64',
                          lod_level=1)
    # trans batch information to shape[0]
    data_share_trans = np.transpose(data_share, [1, 0, 2])
    ldata_mpc = fluid.create_lod_tensor(data_share_trans, [[3]],
                                        fluid.CPUPlace())
    w_param_attrs1 = fluid.ParamAttr(
        name='mpc_gru_weight',
        learning_rate=0.5,
        initializer=pfl_mpc.initializer.NumpyArrayInitializer(weight_share),
        trainable=True)
    w_param_attrs2 = fluid.ParamAttr(
        name='mpc_gru_weight_cheb',
        learning_rate=0.5,
        initializer=pfl_mpc.initializer.NumpyArrayInitializer(weight_share),
        trainable=True)
    hidden_mpc = pfl_mpc.layers.dynamic_gru(input=data_mpc,
                                            size=hidden_dim,
                                            param_attr=w_param_attrs1)
    hidden_mpc_cheb = pfl_mpc.layers.dynamic_gru(
        input=data_mpc,
        size=hidden_dim,
        param_attr=w_param_attrs2,
        gate_activation='sigmoid_chebyshev')

    exe = fluid.Executor(place=fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    results = exe.run(feed={'input_paddle': ldata_paddle,
                            'input_mpc': ldata_mpc},
                      fetch_list=[hidden_paddle, hidden_mpc, hidden_mpc_cheb],
                      return_numpy=False)

    return_results.append(np.array(results[1]))
    return_results_cheb.append(np.array(results[2]))
    expected_result.append(np.array(results[0]))