def __init__(self, hp, voca_size, is_training=True):
    config = BertConfig(vocab_size=voca_size,
                        hidden_size=hp.hidden_units,
                        num_hidden_layers=hp.num_blocks,
                        num_attention_heads=hp.num_heads,
                        intermediate_size=hp.intermediate_size,
                        type_vocab_size=hp.type_vocab_size,
                        )
    seq_length = hp.seq_max
    use_tpu = False

    input_ids = placeholder(tf.int64, [None, seq_length])
    input_mask = placeholder(tf.int64, [None, seq_length])
    segment_ids = placeholder(tf.int64, [None, seq_length])
    label_ids = placeholder(tf.int64, [None])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pooled_output = self.model.get_pooled_output()
    task = ClassificationB(is_training, hp.hidden_units, 3)
    task.call(pooled_output, label_ids)

    self.loss = task.loss
    self.logits = task.logits
    self.acc = task.acc
def __init__(self, hp, num_classes, voca_size, is_training=True):
    config = bert.BertConfig(vocab_size=voca_size,
                             hidden_size=hp.hidden_units,
                             num_hidden_layers=hp.num_blocks,
                             num_attention_heads=hp.num_heads,
                             intermediate_size=hp.intermediate_size,
                             type_vocab_size=hp.type_vocab_size,
                             )
    seq_length = hp.seq_max
    use_tpu = False

    input_ids = placeholder(tf.int64, [None, seq_length])
    input_mask = placeholder(tf.int64, [None, seq_length])
    segment_ids = placeholder(tf.int64, [None, seq_length])
    label_ids = placeholder(tf.int64, [None])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pooled_output = self.model.get_pooled_output()

    output_weights = tf.compat.v1.get_variable(
        "output_weights", [num_classes, hp.hidden_units],
        initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.02)
    )
    output_bias = tf.compat.v1.get_variable(
        "output_bias", [num_classes],
        initializer=tf.compat.v1.zeros_initializer()
    )
    logits = tf.matmul(pooled_output, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    loss_arr = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=label_ids)
    loss = tf.reduce_mean(input_tensor=loss_arr)

    self.loss = loss
    self.logits = logits
    self.sout = tf.nn.softmax(self.logits)
def model_tag(self, sequence_output, seq_max, var_name):
    ex_label = placeholder(tf.int32, [None, seq_max])
    valid_mask = placeholder(tf.float32, [None, 1])
    with tf.variable_scope(var_name):
        ex_logits = tf.layers.dense(sequence_output, 2, name=var_name)
        ex_prob = tf.nn.softmax(ex_logits)[:, :, 1]
        # Keep the per-token losses (reduction=NONE) so that valid_mask can
        # zero out invalid examples before averaging; with the default
        # reduction the result would already be a scalar and the mask would
        # only rescale it.
        losses = tf.losses.softmax_cross_entropy(
            onehot_labels=tf.one_hot(ex_label, 2),
            logits=ex_logits,
            reduction=tf.losses.Reduction.NONE)
        losses = valid_mask * losses
        loss = tf.reduce_mean(losses)
    return {
        'labels': ex_label,
        'mask': valid_mask,
        'ex_logits': ex_logits,
        'score': ex_prob,
        'losses': losses,
        'loss': loss
    }
def model_tag(self, sequence_output, seq_max, var_name):
    ex_labels = placeholder(tf.float32, [None, seq_max])
    valid_mask = placeholder(tf.float32, [None, 1])
    with tf.variable_scope(var_name):
        ex_logits = tf.layers.dense(sequence_output, 1, name=var_name)
        ex_logits = tf.reshape(ex_logits, [-1, seq_max])
        # Binarize the labels: any positive value counts as a positive token.
        labels_ = tf.cast(tf.greater(ex_labels, 0), tf.float32)
        losses = tf_module.correlation_coefficient_loss(ex_logits, -labels_)
        losses = valid_mask * losses
        loss = tf.reduce_mean(losses)
        score = ex_logits
    return {
        'labels': ex_labels,
        'mask': valid_mask,
        'ex_logits': ex_logits,
        'score': score,
        'losses': losses,
        'loss': loss
    }
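# tf_module.correlation_coefficient_loss is imported from elsewhere in the
# repository and is not shown in this file. For reference, a minimal,
# hypothetical sketch of what a Pearson-correlation-based loss could look like
# is given below; the actual signature, reduction, and sign convention of the
# repo's implementation are assumptions here, not confirmed by this file.
def pearson_correlation_loss_sketch(x, y, axis=-1, epsilon=1e-8):
    """Returns the Pearson correlation of x and y along `axis`.

    Used as a loss, minimizing this value pushes the correlation toward -1,
    which would explain why the callers above negate one of the arguments to
    encourage positive correlation with the original labels.
    """
    # Center both tensors along the given axis.
    x_c = x - tf.reduce_mean(x, axis=axis, keepdims=True)
    y_c = y - tf.reduce_mean(y, axis=axis, keepdims=True)
    # r = cov(x, y) / (std(x) * std(y)), computed per example.
    cov = tf.reduce_sum(x_c * y_c, axis=axis)
    denom = tf.sqrt(tf.reduce_sum(tf.square(x_c), axis=axis)
                    * tf.reduce_sum(tf.square(y_c), axis=axis)) + epsilon
    return cov / denom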
def __init__(self, hp, voca_size, method, is_training=True):
    config = BertConfig(vocab_size=voca_size,
                        hidden_size=hp.hidden_units,
                        num_hidden_layers=hp.num_blocks,
                        num_attention_heads=hp.num_heads,
                        intermediate_size=hp.intermediate_size,
                        type_vocab_size=hp.type_vocab_size,
                        )
    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = placeholder(tf.int64, [None, seq_length])
    input_mask = placeholder(tf.int64, [None, seq_length])
    segment_ids = placeholder(tf.int64, [None, seq_length])
    label_ids = placeholder(tf.int64, [None])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = BertModel(config=config,
                           is_training=is_training,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           token_type_ids=segment_ids,
                           use_one_hot_embeddings=use_one_hot_embeddings)

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)

    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
def __init__(self, hp, voca_size, method, is_training=True):
    config = bert.BertConfig(vocab_size=voca_size,
                             hidden_size=hp.hidden_units,
                             num_hidden_layers=hp.num_blocks,
                             num_attention_heads=hp.num_heads,
                             intermediate_size=hp.intermediate_size,
                             type_vocab_size=hp.type_vocab_size,
                             )
    seq_length = hp.seq_max
    use_tpu = False

    input_ids = placeholder(tf.int64, [None, seq_length])
    input_mask = placeholder(tf.int64, [None, seq_length])
    segment_ids = placeholder(tf.int64, [None, seq_length])
    label_ids = placeholder(tf.int64, [None])
    if method in [0, 1, 3, 4, 5, 6]:
        self.rf_mask = placeholder(tf.float32, [None, seq_length])
    elif method in [METHOD_CROSSENT, METHOD_HINGE]:
        self.rf_mask = placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pooled = self.model.get_pooled_output()
    # Note: in TF1 the second positional argument of tf.nn.dropout is keep_prob,
    # so hp.dropout_rate is interpreted here as the probability of keeping a unit.
    pooled = tf.nn.dropout(pooled, hp.dropout_rate)
    logits = tf.layers.dense(pooled, data_generator.NLI.nli_info.num_classes, name="cls_dense")
    labels = tf.one_hot(label_ids, data_generator.NLI.nli_info.num_classes)

    self.acc = tf_module.accuracy(logits, label_ids)
    self.logits = logits
    tf.summary.scalar("acc", self.acc)

    self.loss_arr = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits,
        labels=labels)
    self.loss = tf.reduce_mean(self.loss_arr)
    tf.summary.scalar("loss", self.loss)
def __init__(self, hp, voca_size, method, is_training=True):
    config = bert.BertConfig(vocab_size=voca_size,
                             hidden_size=hp.hidden_units,
                             num_hidden_layers=hp.num_blocks,
                             num_attention_heads=hp.num_heads,
                             intermediate_size=hp.intermediate_size,
                             type_vocab_size=hp.type_vocab_size,
                             )
    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = placeholder(tf.int64, [None, seq_length])
    input_mask = placeholder(tf.int64, [None, seq_length])
    segment_ids = placeholder(tf.int64, [None, seq_length])
    label_ids = placeholder(tf.int64, [None])
    if method in [0, 1, 3, 4, 5, 6]:
        self.rf_mask = placeholder(tf.float32, [None, seq_length])
    elif method in [METHOD_CROSSENT, METHOD_HINGE]:
        self.rf_mask = placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('acc', self.acc)

    # Auxiliary token-level "conflict" head; the objective used to train it
    # depends on `method`.
    if method == 0:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        cl = tf.nn.sigmoid(cl)
        # cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        # self.pkc = self.conf_logits * self.rf_mask
        # rl_loss_list = tf.reduce_sum(self.pkc, axis=1)
        rl_loss_list = tf.reduce_sum(self.conf_logits * tf.cast(self.rf_mask, tf.float32), axis=1)
        self.rl_loss = tf.reduce_mean(rl_loss_list)
    elif method == 1:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        # rl_loss_list = tf_module.cossim(cl, self.rf_mask)
        # self.pkc = self.conf_logits * self.rf_mask
        rl_loss_list = tf.reduce_sum(self.conf_logits * self.rf_mask, axis=1)
        self.rl_loss = tf.reduce_mean(rl_loss_list)
    elif method == METHOD_CROSSENT:
        cl = tf.layers.dense(self.model.get_sequence_output(), 2, name="aux_conflict")
        probs = tf.nn.softmax(cl)
        losses = tf.losses.softmax_cross_entropy(
            onehot_labels=tf.one_hot(self.rf_mask, 2), logits=cl)
        self.conf_logits = probs[:, :, 1] - 0.5
        self.rl_loss = tf.reduce_mean(losses)
    elif method == 3:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        self.bias = tf.Variable(0.0)
        self.conf_logits = (cl + self.bias)
        rl_loss_list = tf.nn.relu(1 - self.conf_logits * self.rf_mask)
        rl_loss_list = tf.reduce_mean(rl_loss_list, axis=1)
        self.rl_loss = tf.reduce_mean(rl_loss_list)
        labels = tf.greater(self.rf_mask, 0)
        hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
        self.hinge_loss = tf.reduce_sum(hinge_losses)
    elif method == 4:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        labels = tf.greater(self.rf_mask, 0)
        hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
        self.rl_loss = hinge_losses
    elif method == 5:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        # cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        self.labels = tf.cast(tf.greater(self.rf_mask, 0), tf.float32)
        self.rl_loss = tf.reduce_mean(
            tf_module.correlation_coefficient_loss(cl, -self.rf_mask))
    elif method == 6:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        # cl = tf.layers.dense(cl1, 1, name="aux_conflict2")
        cl = tf.reshape(cl, [-1, seq_length])
        # cl = tf.nn.sigmoid(cl)
        # cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        # rl_loss_list = tf.reduce_sum(self.conf_logits * self.rf_mask, axis=1)
        self.rl_loss = tf.reduce_mean(
            tf_module.correlation_coefficient_loss(cl, -self.rf_mask))
    elif method == METHOD_HINGE:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        self.conf_logits = cl
        labels = tf.greater(self.rf_mask, 0)
        hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
        self.rl_loss = tf.reduce_sum(hinge_losses)

    self.conf_softmax = tf.nn.softmax(self.conf_logits, axis=-1)
def run():
    all_loss = 0
    tower_grads = []
    input_x_list = []
    input_y_list = []
    models = []
    # Build one model replica ("tower") per GPU, sharing variables across towers.
    for gpu_idx in range(2):
        with tf.device("/gpu:{}".format(gpu_idx)):
            with tf.variable_scope("vars", reuse=gpu_idx > 0):
                input_x = placeholder(tf.float32, [None, 10])
                input_y = placeholder(tf.int32, [None, ])
                input_x_list.append(input_x)
                input_y_list.append(input_y)
                model = FF(input_x, input_y)
                models.append(model)
                tf.get_variable_scope().reuse_variables()
                all_loss += model.task.loss

    tvars = tf.trainable_variables()
    for t in tvars:
        print(t.name)

    # Compute each tower's gradients with respect to its own loss.
    for gpu_idx in range(2):
        grads = tf.gradients(models[gpu_idx].task.loss, tvars)
        print(grads)
        # Keep track of the gradients across all towers.
        tower_grads.append(grads)

    # Average the gradient of each variable across the two towers.
    avg_grads = []
    for t_idx, _ in enumerate(tvars):
        g1 = tower_grads[0][t_idx]
        g2 = tower_grads[1][t_idx]
        g_avg = (g1 + g2) / 2 if g1 is not None else None
        avg_grads.append(g_avg)

    global_step = tf.Variable(0, name='global_step')
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=0.001,
        weight_decay_rate=0.02,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    # Apply the averaged gradients rather than only the last tower's gradients.
    train_cls = optimizer.apply_gradients(zip(avg_grads, tvars), global_step=global_step)
    # train_cls = get_train_op2(all_loss, 0.001, "adam", 10000)

    sess = init_session()
    sess.run(tf.global_variables_initializer())

    def train_classification(i):
        # Alternate the labels fed to the second tower between ones and zeros.
        if i % 2 == 0:
            random_input = np.ones([batch_size, ])
        else:
            random_input = np.zeros([batch_size, ])
        loss_val, _ = sess.run(
            [model.task.loss, train_cls],
            feed_dict={
                input_x_list[0]: np.ones([batch_size, 10]),
                input_x_list[1]: np.ones([batch_size, 10]),
                input_y_list[0]: np.zeros([batch_size, ]),
                input_y_list[1]: random_input,
            })
        print(loss_val)

    for i in range(10):
        print("Train")
        train_classification(i)
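# The averaging loop in run() is hard-coded for exactly two towers. A more
# general helper is sketched below under the assumption that every tower's
# gradient list is aligned with the same variable order (as tf.gradients(loss,
# tvars) guarantees); the helper name and signature are illustrative, not part
# of this repository.
def average_tower_gradients_sketch(tower_grads):
    """tower_grads: list of per-tower gradient lists, one entry per variable."""
    avg_grads = []
    for grads_per_var in zip(*tower_grads):
        if grads_per_var[0] is None:
            # Variable unused by the towers; keep None so apply_gradients skips it.
            avg_grads.append(None)
        else:
            avg_grads.append(tf.add_n(list(grads_per_var)) / len(grads_per_var))
    return avg_grads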