# nn, tensor and cf below refer to the paddle.fluid.layers submodules
# (nn, tensor, control_flow), imported at module level (not shown here).
def net(self, input, is_infer=False):
    """network"""
    text = input[0]
    pos_tag = input[1]
    neg_tag = input[2]

    text_emb = fluid.embedding(
        input=text,
        size=[self.vocab_text_size, self.emb_dim],
        param_attr="text_emb")
    text_emb = fluid.layers.squeeze(input=text_emb, axes=[1])
    pos_tag_emb = fluid.embedding(
        input=pos_tag,
        size=[self.vocab_tag_size, self.emb_dim],
        param_attr="tag_emb")
    pos_tag_emb = fluid.layers.squeeze(input=pos_tag_emb, axes=[1])
    neg_tag_emb = fluid.embedding(
        input=neg_tag,
        size=[self.vocab_tag_size, self.emb_dim],
        param_attr="tag_emb")
    neg_tag_emb = fluid.layers.squeeze(input=neg_tag_emb, axes=[1])

    conv_1d = fluid.nets.sequence_conv_pool(
        input=text_emb,
        num_filters=self.hid_dim,
        filter_size=self.win_size,
        act="tanh",
        pool_type="max",
        param_attr="cnn")
    text_hid = fluid.layers.fc(input=conv_1d,
                               size=self.emb_dim,
                               param_attr="text_hid")
    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
    mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
    mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
    cos_neg_all = fluid.layers.sequence_reshape(
        input=mul_cos_neg, new_dim=self.neg_size)
    # choose the max negative cosine
    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
    # calculate hinge loss
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
    correct = nn.reduce_sum(less)
    self._cost = avg_cost

    if is_infer:
        self._infer_results["correct"] = correct
        self._infer_results["cos_pos"] = cos_pos
    else:
        self._metrics["correct"] = correct
        self._metrics["cos_pos"] = cos_pos
def network(vocab_text_size,
            vocab_tag_size,
            emb_dim=10,
            hid_dim=1000,
            win_size=5,
            margin=0.1,
            neg_size=5):
    """network definition"""
    text = io.data(name="text", shape=[1], lod_level=1, dtype='int64')
    pos_tag = io.data(name="pos_tag", shape=[1], lod_level=1, dtype='int64')
    neg_tag = io.data(name="neg_tag", shape=[1], lod_level=1, dtype='int64')

    text_emb = nn.embedding(
        input=text, size=[vocab_text_size, emb_dim], param_attr="text_emb")
    pos_tag_emb = nn.embedding(
        input=pos_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")
    neg_tag_emb = nn.embedding(
        input=neg_tag, size=[vocab_tag_size, emb_dim], param_attr="tag_emb")

    conv_1d = fluid.nets.sequence_conv_pool(
        input=text_emb,
        num_filters=hid_dim,
        filter_size=win_size,
        act="tanh",
        pool_type="max",
        param_attr="cnn")
    text_hid = fluid.layers.fc(input=conv_1d, size=emb_dim, param_attr="text_hid")
    cos_pos = nn.cos_sim(pos_tag_emb, text_hid)
    mul_text_hid = fluid.layers.sequence_expand_as(x=text_hid, y=neg_tag_emb)
    mul_cos_neg = nn.cos_sim(neg_tag_emb, mul_text_hid)
    cos_neg_all = fluid.layers.sequence_reshape(input=mul_cos_neg, new_dim=neg_size)
    # choose the max negative cosine
    cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
    # calculate hinge loss
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
    correct = nn.reduce_sum(less)
    return avg_cost, correct, cos_pos
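# Both TagSpace variants above rank the text against the hardest of the neg_size
# sampled negative tags: loss = max(0, margin - cos_pos + max_i cos_neg_i).
# A minimal NumPy sketch of the same computation for reference (array names are
# illustrative, not part of the original code):
import numpy as np

def max_neg_hinge_loss(cos_pos, cos_neg_all, margin=0.1):
    """cos_pos: [batch, 1]; cos_neg_all: [batch, neg_size]."""
    cos_neg = cos_neg_all.max(axis=1, keepdims=True)      # hardest negative per sample
    loss = np.maximum(0.0, margin - cos_pos + cos_neg)    # hinge loss
    correct = np.sum(cos_neg < cos_pos)                   # correctly ranked pairs
    return loss.mean(), correct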
def train(self):
    user_data = io.data(name="user", shape=[1], dtype="int64", lod_level=1)
    pos_item_data = io.data(name="p_item", shape=[1], dtype="int64", lod_level=1)
    neg_item_data = io.data(name="n_item", shape=[1], dtype="int64", lod_level=1)

    # user, positive item and negative item share the embedding table "emb.item"
    user_emb = nn.embedding(
        input=user_data, size=self.emb_shape, param_attr="emb.item")
    pos_item_emb = nn.embedding(
        input=pos_item_data, size=self.emb_shape, param_attr="emb.item")
    neg_item_emb = nn.embedding(
        input=neg_item_data, size=self.emb_shape, param_attr="emb.item")

    user_enc = self.user_encoder.forward(user_emb)
    pos_item_enc = self.item_encoder.forward(pos_item_emb)
    neg_item_enc = self.item_encoder.forward(neg_item_emb)

    # project user and items into the same hidden space; the two item
    # projections share parameters ("item.w" / "item.b")
    user_hid = nn.fc(input=user_enc,
                     size=self.hidden_size,
                     param_attr='user.w',
                     bias_attr="user.b")
    pos_item_hid = nn.fc(input=pos_item_enc,
                         size=self.hidden_size,
                         param_attr='item.w',
                         bias_attr="item.b")
    neg_item_hid = nn.fc(input=neg_item_enc,
                         size=self.hidden_size,
                         param_attr='item.w',
                         bias_attr="item.b")

    cos_pos = nn.cos_sim(user_hid, pos_item_hid)
    cos_neg = nn.cos_sim(user_hid, neg_item_hid)
    hinge_loss = self.pairwise_hinge_loss.forward(cos_pos, cos_neg)
    avg_cost = nn.mean(hinge_loss)
    correct = self.get_correct(cos_neg, cos_pos)

    return [user_data, pos_item_data, neg_item_data], cos_pos, avg_cost, correct
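# pairwise_hinge_loss.forward and get_correct are defined elsewhere; judging
# from the explicit formulation in train_net below, they presumably compute
# something close to this NumPy sketch (function names here are illustrative):
import numpy as np

def pairwise_hinge_sketch(cos_pos, cos_neg, margin=0.1):
    # hinge on the cosine gap between the positive and the negative item
    return np.maximum(0.0, margin - cos_pos + cos_neg)

def get_correct_sketch(cos_neg, cos_pos):
    # number of pairs ranked correctly, i.e. positive scored above negative
    return np.sum(cos_neg < cos_pos)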
def softmax_classify(self,
                     x,
                     label,
                     param_attr=None,
                     use_bias=True,
                     bias_attr=None):
    flatten_dim = reduce(lambda a, b: a * b, x.shape[1:], 1)
    weight, bias = self.create_parameter(dtype=x.dtype,
                                         in_dim=flatten_dim,
                                         param_attr=param_attr,
                                         bias_attr=bias_attr,
                                         use_bias=use_bias)

    # gather features and labels from all ranks so every rank sees the full
    # batch, while each rank only holds its shard of the FC weight
    x_all = collective._c_allgather(x, nranks=self.nranks, use_calc_stream=True)
    label_all = collective._c_allgather(label,
                                        nranks=self.nranks,
                                        use_calc_stream=True)
    label_all.stop_gradient = True

    # local logits for the classes owned by this shard
    shard_fc = nn.mul(x_all, weight, x_num_col_dims=1)
    if use_bias:
        shard_fc = nn.elementwise_add(shard_fc, bias)

    # map global labels to shard-local indices; labels belonging to other
    # shards become ignore_value (-1)
    shard_label = nn.shard_index(label_all,
                                 index_num=self.nclasses,
                                 nshards=self.nranks,
                                 shard_id=self.rank_id,
                                 ignore_value=-1)
    shard_label.stop_gradient = True

    global_loss, shard_prob = self.softmax_with_cross_entropy(
        shard_fc, shard_label)
    avg_loss = nn.mean(global_loss)

    avg_loss._set_info('shard_logit', shard_fc)
    avg_loss._set_info('shard_prob', shard_prob)
    avg_loss._set_info('shard_label', shard_label)
    avg_loss._set_info('shard_dim', self.shard_dim)

    return avg_loss
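# For reference, shard_index maps a global class id to an index local to the
# current shard and returns ignore_value for classes owned by other shards.
# A rough pure-Python sketch (the exact shard-size rounding is an assumption):
def shard_index_sketch(label, index_num, nshards, shard_id, ignore_value=-1):
    shard_size = (index_num + nshards - 1) // nshards      # assumed ceil division
    if shard_id * shard_size <= label < (shard_id + 1) * shard_size:
        return label - shard_id * shard_size               # class lives on this shard
    return ignore_value                                    # class lives elsewhere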
def train_net(self):
    # input fields for query, pos_title, neg_title
    q_slots = [
        io.data(name="q%d" % i, shape=[1], lod_level=1, dtype='int64')
        for i in range(len(self.query_encoders))
    ]
    pt_slots = [
        io.data(name="pt%d" % i, shape=[1], lod_level=1, dtype='int64')
        for i in range(len(self.title_encoders))
    ]
    nt_slots = [
        io.data(name="nt%d" % i, shape=[1], lod_level=1, dtype='int64')
        for i in range(len(self.title_encoders))
    ]

    # look up the embedding for each slot
    q_embs = [
        nn.embedding(input=query, size=self.emb_shape, param_attr="emb")
        for query in q_slots
    ]
    pt_embs = [
        nn.embedding(input=title, size=self.emb_shape, param_attr="emb")
        for title in pt_slots
    ]
    nt_embs = [
        nn.embedding(input=title, size=self.emb_shape, param_attr="emb")
        for title in nt_slots
    ]

    # encode each embedding field with its encoder
    q_encodes = [
        self.query_encoders[i].forward(emb) for i, emb in enumerate(q_embs)
    ]
    pt_encodes = [
        self.title_encoders[i].forward(emb) for i, emb in enumerate(pt_embs)
    ]
    nt_encodes = [
        self.title_encoders[i].forward(emb) for i, emb in enumerate(nt_embs)
    ]

    # concat the multiple views for query, pos_title, neg_title
    q_concat = nn.concat(q_encodes)
    pt_concat = nn.concat(pt_encodes)
    nt_concat = nn.concat(nt_encodes)

    # project to the hidden layer
    q_hid = nn.fc(q_concat,
                  size=self.hidden_size,
                  param_attr='q_fc.w',
                  bias_attr='q_fc.b')
    pt_hid = nn.fc(pt_concat,
                   size=self.hidden_size,
                   param_attr='t_fc.w',
                   bias_attr='t_fc.b')
    nt_hid = nn.fc(nt_concat,
                   size=self.hidden_size,
                   param_attr='t_fc.w',
                   bias_attr='t_fc.b')

    # cosine of the hidden layers
    cos_pos = nn.cos_sim(q_hid, pt_hid)
    cos_neg = nn.cos_sim(q_hid, nt_hid)

    # pairwise hinge loss
    loss_part1 = nn.elementwise_sub(
        tensor.fill_constant_batch_size_like(
            input=cos_pos, shape=[-1, 1], value=self.margin, dtype='float32'),
        cos_pos)
    loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
    loss_part3 = nn.elementwise_max(
        tensor.fill_constant_batch_size_like(
            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
        loss_part2)
    avg_cost = nn.mean(loss_part3)
    correct = self.get_correct(cos_neg, cos_pos)

    return q_slots + pt_slots + nt_slots, avg_cost, correct
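# Note that the positive- and negative-title towers above reuse one set of
# parameters by passing the same param_attr/bias_attr names ('t_fc.w'/'t_fc.b',
# plus the shared "emb" table), which is how fluid expresses weight sharing.
# A minimal sketch of that pattern (names and sizes are made up for
# illustration, assuming paddle.fluid 1.x):
import paddle.fluid as fluid

a = fluid.layers.data(name="a", shape=[8], dtype="float32")
b = fluid.layers.data(name="b", shape=[8], dtype="float32")
# both fc calls name the same parameters, so they share one weight matrix and bias
ha = fluid.layers.fc(input=a, size=4, param_attr="shared.w", bias_attr="shared.b")
hb = fluid.layers.fc(input=b, size=4, param_attr="shared.w", bias_attr="shared.b")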
def arcface_classify(self,
                     x,
                     label,
                     margin=0.5,
                     logit_scale=64,
                     param_attr=None):
    '''
    reference: ArcFace. https://arxiv.org/abs/1801.07698
    '''
    flatten_dim = reduce(lambda a, b: a * b, x.shape[1:], 1)
    weight, bias = self.create_parameter(dtype=x.dtype,
                                         in_dim=flatten_dim,
                                         param_attr=param_attr,
                                         use_bias=False)

    # normalize x
    x_l2 = ops.sqrt(nn.reduce_sum(ops.square(x), dim=1))
    norm_x = nn.elementwise_div(x, x_l2, axis=0)

    norm_x_all = collective._c_allgather(norm_x,
                                         nranks=self.nranks,
                                         use_calc_stream=True)
    label_all = collective._c_allgather(label,
                                        nranks=self.nranks,
                                        use_calc_stream=True)
    label_all.stop_gradient = True

    shard_label = nn.shard_index(label_all,
                                 index_num=self.nclasses,
                                 nshards=self.nranks,
                                 shard_id=self.rank_id,
                                 ignore_value=-1)
    # TODO check necessary
    shard_label.stop_gradient = True

    # normalize weight
    weight_l2 = ops.sqrt(nn.reduce_sum(ops.square(weight), dim=0))
    norm_weight = nn.elementwise_div(weight, weight_l2, axis=1)

    shard_cos = nn.mul(norm_x_all, norm_weight, x_num_col_dims=1)

    theta = ops.acos(shard_cos)
    margin_cos = ops.cos(theta + margin)

    shard_one_hot = nn.one_hot(shard_label,
                               depth=self.shard_dim,
                               allow_out_of_range=True)
    # TODO check necessary
    shard_one_hot.stop_gradient = True

    diff = (margin_cos - shard_cos) * shard_one_hot
    shard_target_cos = shard_cos + diff
    shard_logit = nn.scale(shard_target_cos, scale=logit_scale)

    global_loss, shard_prob = self.softmax_with_cross_entropy(
        shard_logit, shard_label)
    avg_loss = nn.mean(global_loss)

    avg_loss._set_info('shard_logit', shard_logit)
    avg_loss._set_info('shard_prob', shard_prob)
    avg_loss._set_info('shard_label', shard_label)
    avg_loss._set_info('shard_dim', self.shard_dim)

    return avg_loss
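# The additive angular margin applied above replaces the target-class logit
# cos(theta) with cos(theta + margin) and scales every logit by logit_scale
# before the sharded softmax cross-entropy. A single-sample NumPy sketch of the
# same transform (names and shapes are illustrative only):
import numpy as np

def arcface_logits_sketch(x, weight, label, margin=0.5, scale=64.0):
    """x: [feat_dim]; weight: [feat_dim, nclasses]; label: int class id."""
    x = x / np.linalg.norm(x)                       # normalize the feature
    w = weight / np.linalg.norm(weight, axis=0)     # normalize each class column
    cos = x @ w                                     # cosine to every class
    theta = np.arccos(np.clip(cos, -1.0, 1.0))
    logits = cos.copy()
    logits[label] = np.cos(theta[label] + margin)   # additive angular margin
    return scale * logits                           # fed into softmax cross-entropy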