Example 1
    def __init__(self, hp, voca_size, method, is_training=True):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(data_generator.NLI.nli_info.num_classes)

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        label_ids = tf.placeholder(tf.int64, [None])
        if method in [0, 1, 3, 4, 5, 6]:
            self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
        elif method in [2]:
            self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids

        use_one_hot_embeddings = use_tpu
        with tf.variable_scope("part1"):
            self.model1 = bert.BertModel(
                config=config,
                is_training=is_training,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=segment_ids,
                use_one_hot_embeddings=use_one_hot_embeddings)

        with tf.variable_scope("part2"):
            self.model2 = bert.BertModel(
                config=config,
                is_training=is_training,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=segment_ids,
                use_one_hot_embeddings=use_one_hot_embeddings)

        enc = tf.concat([self.model1.get_sequence_output(),
                         self.model2.get_sequence_output()],
                        axis=2)

        pred, loss = task.predict(enc, label_ids, True)

        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss
        self.acc = task.acc
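A minimal training-step sketch for the two-tower class above. `DualBertNLI` is a hypothetical name for the class, `hp` is assumed to carry the fields referenced in `__init__`, and the random batch stands in for real NLI features; 30522 matches the vocabulary size used elsewhere in these examples.

import numpy as np
import tensorflow as tf

# `DualBertNLI` and `hp` are hypothetical stand-ins for the class and
# hyperparameters defined above; adjust the names to the actual module.
model = DualBertNLI(hp, voca_size=30522, method=0)
train_op = tf.train.AdamOptimizer(1e-5).minimize(model.loss)

batch, seq = 8, hp.seq_max
feed = {
    model.x_list[0]: np.random.randint(0, 30522, (batch, seq)),  # input_ids
    model.x_list[1]: np.ones((batch, seq), dtype=np.int64),      # input_mask
    model.x_list[2]: np.zeros((batch, seq), dtype=np.int64),     # segment_ids
    model.y: np.random.randint(0, 3, (batch,)),                  # NLI labels
}
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    _, loss_v, acc_v = sess.run([train_op, model.loss, model.acc], feed)
    print("loss=%.4f acc=%.4f" % (loss_v, acc_v))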
Example 2
    def __init__(self, hp, voca_size, method, is_training=True):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(data_generator.NLI.nli_info.num_classes)

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        label_ids = tf.placeholder(tf.int64, [None])
        if method in [0, 1, 3, 4, 5, 6]:
            self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
        elif method in [METHOD_CROSSENT, METHOD_HINGE]:
            self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids

        use_one_hot_embeddings = use_tpu
        self.model = bert_get_hidden.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        pred, loss = task.predict(self.model.get_sequence_output(), label_ids,
                                  True)

        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss

        all_layer_grads = []
        all_layers = self.model.all_layer_outputs
        for layer in all_layers:
            grad = tf.gradients(self.logits, layer)
            all_layer_grads.append(grad)

        grad_emb = tf.gradients(self.logits, self.model.embedding_output)
        self.all_layer_grads = all_layer_grads
        self.grad_emb = grad_emb
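The gradient tensors collected above are only useful once evaluated; a sketch of pulling per-layer token saliency out of them, assuming `model` and a `feed` dict built as in the previous sketch:

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    layer_grads, emb_grad = sess.run(
        [model.all_layer_grads, model.grad_emb], feed_dict=feed)

# tf.gradients returns a one-element list per call, hence the (g,) unpacking.
for i, (g,) in enumerate(layer_grads):
    saliency = np.linalg.norm(g, axis=-1)  # L2 norm per token, [batch, seq]
    print("layer %d, first tokens:" % i, saliency[0][:5])
print("embedding grad shape:", emb_grad[0].shape)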
Example 3
    def __init__(self, hp, voca_size, mode=1):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(data_generator.NLI.nli_info.num_classes)

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        scores = tf.placeholder(tf.float32, [None])
        #        self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
        self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = scores

        use_one_hot_embeddings = use_tpu
        is_training = True
        self.model = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        if mode == 1:
            enc = self.model.get_pooled_output()
        else:
            enc = self.model.get_all_encoder_layers()
        self.enc = enc
        logits = tf.layers.dense(enc, 1, name="reg_dense")  # [None, 1]
        self.logits = logits

        paired = tf.reshape(logits, [-1, 2])
        y_paired = tf.reshape(self.y, [-1, 2])
        raw_l = paired[:, 1] - paired[:, 0]
        losses = tf.maximum(hp.alpha - raw_l, 0)  # pairwise hinge with margin hp.alpha

        self.loss = tf.reduce_mean(losses)
        tf.summary.scalar('loss', self.loss)
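The reshape to `[-1, 2]` treats consecutive rows as pairs, so `losses` is a margin-ranking hinge: the second element of each pair must outscore the first by at least `hp.alpha`. A tiny NumPy check of that arithmetic, with the margin assumed to be 1.0:

import numpy as np

alpha = 1.0
logits = np.array([0.2, 1.5, 0.9, 1.0])  # two pairs: (0.2, 1.5), (0.9, 1.0)
paired = logits.reshape(-1, 2)
margin = paired[:, 1] - paired[:, 0]     # [1.3, 0.1]
losses = np.maximum(alpha - margin, 0)   # [0.0, 0.9]; the second pair violates
print(losses.mean())                     # 0.45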
Example 4
    def stance_cold(self):
        hp = hyperparams.HPColdStart()
        topic = "atheism"
        setting = shared_setting.TopicTweets2Stance(topic)
        model_dir = get_model_dir("stance_cold_{}".format(topic))

        task = Classification(3)
        model = Transformer(hp, setting.vocab_size, task)
        param = {
            'feature_columns': self.get_feature_column(),
            'n_classes': 3,
        }
        estimator = tf.estimator.Estimator(
            model_fn=model.model_fn,
            model_dir=model_dir,
            params=param,
            config=None)

        data_source = stance_detection.DataLoader(topic, hp.seq_max, setting.vocab_filename)

        def train_input_fn(features, labels, batch_size):
            f_dict = pd.DataFrame(data=features)
            dataset = tf.data.Dataset.from_tensor_slices((dict(f_dict), labels))
            # Shuffle, repeat, and batch the examples.
            return dataset.shuffle(1000).repeat().batch(batch_size)

        def dev_input_fn(batch_size):
            features, labels = data_source.get_dev_data()
            f_dict = pd.DataFrame(data=features)
            dataset = tf.data.Dataset.from_tensor_slices((dict(f_dict), labels))
            # Shuffle and batch the examples.
            return dataset.shuffle(1000).batch(batch_size)

        X, Y = data_source.get_train_data()
        num_epoch = 10
        batch_size = 32
        step_per_epoch = (len(Y) - 1) // batch_size + 1
        tf.logging.info("Logging Test")
        tf.logging.info("num epoch %d", num_epoch)
        estimator.train(lambda: train_input_fn(X, Y, batch_size),
                        max_steps=num_epoch * step_per_epoch)

        print(estimator.evaluate(lambda: dev_input_fn(batch_size)))
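`step_per_epoch` relies on the integer ceiling trick `(n - 1) // b + 1`, which avoids the float that `estimator.train(max_steps=...)` would reject. A quick sanity check:

# (n - 1) // b + 1 equals ceil(n / b) for any n >= 1.
for n, b in [(100, 32), (96, 32), (1, 32)]:
    assert (n - 1) // b + 1 == -(-n // b)
    print(n, b, (n - 1) // b + 1)  # -> 4, 3, 1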
Example 5
    def __init__(self, hp, voca_size, is_training=True):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(2)

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        label_ids = tf.placeholder(tf.int64, [None])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids

        use_one_hot_embeddings = use_tpu
        self.model = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        pred, loss = task.predict(self.model.get_sequence_output(), label_ids,
                                  True)

        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss
        self.acc = task.acc
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('acc', self.acc)
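The two scalar summaries above only reach TensorBoard once they are merged and written; a minimal sketch, assuming `model` and `feed` as in the earlier sketches and an arbitrary log directory:

import tensorflow as tf

merged = tf.summary.merge_all()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter("/tmp/demo_run", sess.graph)
    summary_str, _ = sess.run([merged, model.loss], feed_dict=feed)
    writer.add_summary(summary_str, global_step=0)
    writer.close()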
Example 6
    def train_classification(self, data_loader):
        hp = HP()
        tpu_cluster_resolver = None

        if FLAGS.use_tpu:
            model_dir = FLAGS.model_dir
            hp.batch_size = FLAGS.batch_size
            data_dir = FLAGS.data_dir
            input_pattern = os.path.join(data_dir, "Thus.train_*")
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu)
            init_checkpoint = FLAGS.init_checkpoint
        else:
            model_dir = get_model_dir("causal")
            input_pattern = os.path.join(cpath.data_path, "causal", "Thus.train_*")
            init_checkpoint = os.path.join(cpath.model_path, "runs", FLAGS.init_checkpoint)


        vocab_size = 30522

        task = Classification(3)
        model = transformer_est.TransformerEst(hp, vocab_size, task, FLAGS.use_tpu, init_checkpoint)

        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=None,
            model_dir=model_dir,
            save_checkpoints_steps=1000,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=1000,
                num_shards=8,
                per_host_input_for_training=is_per_host))

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=FLAGS.use_tpu,
            model_fn=model.model_fn,
            config=run_config,
            train_batch_size=hp.batch_size,
            eval_batch_size=hp.batch_size)

        input_files = tf.gfile.Glob(input_pattern)
        for input_file in input_files:
            tf.logging.info("  %s" % input_file)

        train_files = data_loader.get_train()
        eval_files = input_files[:1]

        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", hp.batch_size)
        train_input_fn = input_fn_builder(
            input_files=train_files,
            max_seq_length=hp.seq_max,
            is_training=True)

        class _LoggerHook(tf.train.SessionRunHook):
            def __init__(self, log_frequency):
                self.log_frequency = log_frequency

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(task.loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % self.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = self.log_frequency * hp.batch_size / duration
                    sec_per_batch = float(duration / self.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        hook = _LoggerHook(100)
        estimator.train(input_fn=train_input_fn,
                        hooks=[hook],
                        max_steps=FLAGS.train_steps)

        eval_input_fn = input_fn_builder(
            input_files=eval_files,
            max_seq_length=hp.seq_max,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=20)

        tf.logging.info("***** Eval results *****")
        for key in sorted(result.keys()):
            tf.logging.info("  %s = %s", key, str(result[key]))
Example 7
    def __init__(self, hp, voca_size, num_class_list, is_training=True):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )
        seq_length = hp.seq_max
        use_tpu = False

        input_ids = tf.placeholder(tf.int64, [None, seq_length],
                                   name="input_ids")
        input_mask = tf.placeholder(tf.int64, [None, seq_length],
                                    name="input_mask")
        segment_ids = tf.placeholder(tf.int64, [None, seq_length],
                                     name="segment_ids")

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y1 = tf.placeholder(tf.int64, [None], name="y1")
        self.y2 = tf.placeholder(tf.int64, [None], name="y2")
        self.y = [self.y1, self.y2]
        summary1 = {}
        summary2 = {}
        self.summary_list = [summary1, summary2]

        use_one_hot_embeddings = use_tpu
        self.model = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        task = Classification(num_class_list[0])
        pred, loss = task.predict(self.model.get_sequence_output(), self.y1,
                                  True)
        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss
        self.acc = task.acc
        summary1['loss1'] = tf.summary.scalar('loss', self.loss)
        summary1['acc1'] = tf.summary.scalar('acc', self.acc)

        with tf.variable_scope("cls2"):
            task2 = Classification(num_class_list[1])
            pred, loss = task2.predict(self.model.get_sequence_output(),
                                       self.y2, True)
            self.logits2 = task2.logits
            self.sout2 = tf.nn.softmax(self.logits2)
            self.pred2 = pred
            self.loss2 = loss
            self.acc2 = task2.acc
            summary2['loss2'] = tf.summary.scalar('loss2', self.loss2)
            summary2['acc2'] = tf.summary.scalar('acc2', self.acc2)

        self.logit_list = [self.logits, self.logits2]
        self.loss_list = [self.loss, self.loss2]
        self.pred_list = [self.pred, self.pred2]
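One simple way to drive the two heads above is to alternate batches between tasks, stepping only the matching loss. A sketch, where `model` is an instance of the class above and `next_batch` is a hypothetical provider that fills `model.x_list` and `model.y[task_idx]`:

import tensorflow as tf

train_ops = [tf.train.AdamOptimizer(1e-5).minimize(loss)
             for loss in model.loss_list]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(1000):
        task_idx = step % 2            # alternate between the two tasks
        feed = next_batch(task_idx)    # hypothetical batch provider
        sess.run(train_ops[task_idx], feed_dict=feed)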
Example 8
    def __init__(self, hp, voca_size, is_training):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(data_generator.NLI.nli_info.num_classes)

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        label_ids = tf.placeholder(tf.int64, [None])
        #        self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
        self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids
        self.encoded_embedding_in = tf.placeholder(
            tf.float32, [None, seq_length, hp.hidden_units])
        self.attention_mask_in = tf.placeholder(tf.float32,
                                                [None, seq_length, seq_length])
        use_one_hot_embeddings = use_tpu
        self.model = bert.BertEmbeddingInOut(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            embeddding_as_input=(self.encoded_embedding_in,
                                 self.attention_mask_in),
        )

        self.encoded_embedding_out = self.model.embedding_output
        self.attention_mask_out = self.model.attention_mask

        pred, loss = task.predict(self.model.get_sequence_output(), label_ids,
                                  True)

        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss
        self.acc = task.acc
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('acc', self.acc)

        cl = tf.layers.dense(self.model.get_sequence_output(),
                             1,
                             name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        #cl = tf.nn.sigmoid(cl)
        #cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        #self.pkc = self.conf_logits * self.rf_mask
        #rl_loss_list = tf.reduce_sum(self.pkc, axis=1)
        rl_loss_list = tf.reduce_sum(self.conf_logits *
                                     tf.cast(self.rf_mask, tf.float32),
                                     axis=1)

        num_tagged = tf.nn.relu(self.conf_logits + 1)
        self.verbose_loss = tf.reduce_mean(tf.reduce_sum(num_tagged, axis=1))
        self.rl_loss = tf.reduce_mean(rl_loss_list)
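The `*_in`/`*_out` pairs above suggest a two-pass pattern: export a batch's embeddings and attention mask, perturb them, and feed them back through the input placeholders. A rough sketch, assuming `model` and `feed` as in the earlier sketches and assuming `BertEmbeddingInOut` can run the first pass from token inputs alone:

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Pass 1: export the embeddings and attention mask for this batch.
    emb, att = sess.run(
        [model.encoded_embedding_out, model.attention_mask_out], feed_dict=feed)

    # Pass 2: feed perturbed embeddings back in; the noise scale is arbitrary.
    feed2 = dict(feed)
    feed2[model.encoded_embedding_in] = emb + np.random.normal(0.0, 0.01, emb.shape)
    feed2[model.attention_mask_in] = att
    logits = sess.run(model.logits, feed_dict=feed2)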
Example 9
    def __init__(self, hp, voca_size, method, is_training=True):
        config = bert.BertConfig(
            vocab_size=voca_size,
            hidden_size=hp.hidden_units,
            num_hidden_layers=hp.num_blocks,
            num_attention_heads=hp.num_heads,
            intermediate_size=hp.intermediate_size,
            type_vocab_size=hp.type_vocab_size,
        )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(data_generator.NLI.nli_info.num_classes)

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        label_ids = tf.placeholder(tf.int64, [None])
        if method in [0, 1, 3, 4, 5, 6]:
            self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
        elif method in [METHOD_CROSSENT, METHOD_HINGE]:
            self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids

        use_one_hot_embeddings = use_tpu
        self.model = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        pred, loss = task.predict(self.model.get_sequence_output(), label_ids,
                                  True)

        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss
        self.acc = task.acc
        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('acc', self.acc)
        if method == 0:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            cl = tf.reshape(cl, [-1, seq_length])
            cl = tf.nn.sigmoid(cl)
            # cl = tf.contrib.layers.layer_norm(cl)
            self.conf_logits = cl
            # self.pkc = self.conf_logits * self.rf_mask
            # rl_loss_list = tf.reduce_sum(self.pkc, axis=1)
            rl_loss_list = tf.reduce_sum(self.conf_logits *
                                         tf.cast(self.rf_mask, tf.float32),
                                         axis=1)
            self.rl_loss = tf.reduce_mean(rl_loss_list)
        elif method == 1:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            cl = tf.reshape(cl, [-1, seq_length])
            cl = tf.contrib.layers.layer_norm(cl)
            self.conf_logits = cl
            #rl_loss_list = tf_module.cossim(cl, self.rf_mask)
            #self.pkc = self.conf_logits * self.rf_mask
            rl_loss_list = tf.reduce_sum(self.conf_logits * self.rf_mask,
                                         axis=1)
            self.rl_loss = tf.reduce_mean(rl_loss_list)
        elif method == METHOD_CROSSENT:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 2,
                                 name="aux_conflict")
            probs = tf.nn.softmax(cl)
            losses = tf.losses.softmax_cross_entropy(
                onehot_labels=tf.one_hot(self.rf_mask, 2), logits=cl)
            self.conf_logits = probs[:, :, 1] - 0.5
            self.rl_loss = tf.reduce_mean(losses)
        elif method == 3:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            cl = tf.reshape(cl, [-1, seq_length])
            self.bias = tf.Variable(0.0)
            self.conf_logits = (cl + self.bias)
            rl_loss_list = tf.nn.relu(1 - self.conf_logits * self.rf_mask)
            rl_loss_list = tf.reduce_mean(rl_loss_list, axis=1)
            self.rl_loss = tf.reduce_mean(rl_loss_list)
            labels = tf.greater(self.rf_mask, 0)
            hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
            self.hinge_loss = tf.reduce_sum(hinge_losses)
        elif method == 4:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            cl = tf.reshape(cl, [-1, seq_length])
            cl = tf.contrib.layers.layer_norm(cl)
            self.conf_logits = cl
            labels = tf.greater(self.rf_mask, 0)
            hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
            self.rl_loss = hinge_losses
        elif method == 5:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            cl = tf.reshape(cl, [-1, seq_length])
            #cl = tf.contrib.layers.layer_norm(cl)
            self.conf_logits = cl
            self.labels = tf.cast(tf.greater(self.rf_mask, 0), tf.float32)
            self.rl_loss = tf.reduce_mean(
                tf_module.correlation_coefficient_loss(cl, -self.rf_mask))
        elif method == 6:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            #cl = tf.layers.dense(cl1, 1, name="aux_conflict2")
            cl = tf.reshape(cl, [-1, seq_length])
            #cl = tf.nn.sigmoid(cl)
            #cl = tf.contrib.layers.layer_norm(cl)
            self.conf_logits = cl
            #rl_loss_list = tf.reduce_sum(self.conf_logits * self.rf_mask , axis=1)
            self.rl_loss = tf.reduce_mean(
                tf_module.correlation_coefficient_loss(cl, -self.rf_mask))
        elif method == METHOD_HINGE:
            cl = tf.layers.dense(self.model.get_sequence_output(),
                                 1,
                                 name="aux_conflict")
            cl = tf.reshape(cl, [-1, seq_length])
            self.conf_logits = cl
            labels = tf.greater(self.rf_mask, 0)
            hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
            self.rl_loss = tf.reduce_sum(hinge_losses)

        self.conf_softmax = tf.nn.softmax(self.conf_logits, axis=-1)
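`tf_module.correlation_coefficient_loss` (methods 5 and 6) is not shown here. A plausible sketch consistent with the call site, which expects a per-example value to average: negative Pearson correlation between the token scores and the (negated) mask. This is an assumption about the helper, not the repo's code:

import tensorflow as tf

def correlation_coefficient_loss(x, y):
    # Negative Pearson correlation per example over the sequence axis,
    # so minimizing the loss pushes x and y to be positively correlated.
    mx = tf.reduce_mean(x, axis=1, keepdims=True)
    my = tf.reduce_mean(y, axis=1, keepdims=True)
    xm, ym = x - mx, y - my
    cov = tf.reduce_sum(xm * ym, axis=1)
    denom = tf.sqrt(tf.reduce_sum(xm * xm, axis=1) *
                    tf.reduce_sum(ym * ym, axis=1)) + 1e-8
    return -cov / denom  # shape [batch], values in [-1, 1]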
Example 10
    def __init__(self, hp, voca_size, method, is_training=True):
        config = bert.BertConfig(vocab_size=voca_size,
                                 hidden_size=hp.hidden_units,
                                 num_hidden_layers=hp.num_blocks,
                                 num_attention_heads=hp.num_heads,
                                 intermediate_size=hp.intermediate_size,
                                 type_vocab_size=hp.type_vocab_size,
                                 )

        seq_length = hp.seq_max
        use_tpu = False
        task = Classification(data_generator.NLI.nli_info.num_classes)
        task2_num_classes = 3

        input_ids = tf.placeholder(tf.int64, [None, seq_length])
        input_mask = tf.placeholder(tf.int64, [None, seq_length])
        segment_ids = tf.placeholder(tf.int64, [None, seq_length])
        label_ids = tf.placeholder(tf.int64, [None])
        if method in [0, 1, 3, 4, 5, 6]:
            self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
        elif method in [2]:
            self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

        self.x_list = [input_ids, input_mask, segment_ids]
        self.y = label_ids
        self.y1 = tf.placeholder(tf.int64, [None], name="y1")
        self.y2 = tf.placeholder(tf.int64, [None], name="y2")
        self.f_loc1 = tf.placeholder(tf.int64, [None], name="f_loc1")
        self.f_loc2 = tf.placeholder(tf.int64, [None], name="f_loc2")

        use_one_hot_embeddings = use_tpu
        self.model = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)

        self.logits = task.logits
        self.sout = tf.nn.softmax(self.logits)
        self.pred = pred
        self.loss = loss
        self.acc = task.acc
        #tf.summary.scalar('loss', self.loss)
        #tf.summary.scalar('acc', self.acc)

        enc = self.model.get_sequence_output()  # [Batch, seq_len, hidden_dim]

        logits_raw = tf.layers.dense(enc, 3)  # [Batch, seq_len, 3]

        def select(logits, f_loc):
            # One-hot mask over the sequence axis picks the logits at f_loc.
            mask = tf.reshape(tf.one_hot(f_loc, seq_length),
                              [-1, seq_length, 1])  # [Batch, seq_len, 1]
            return tf.reduce_sum(logits * mask, axis=1)

        logits1 = select(logits_raw, self.f_loc1) # [Batch, 3]
        logits2 = select(logits_raw, self.f_loc2)  # [Batch, 3]
        self.logits1 = logits1
        self.logits2 = logits2
        label1 = tf.one_hot(self.y1, task2_num_classes) # [Batch, num_class]
        label2 = tf.one_hot(self.y2, task2_num_classes)
        losses1_arr = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits1,
            labels=label1)

        losses2_arr = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits2,
            labels=label2)

        self.loss_paired = tf.reduce_mean(losses1_arr)  # + tf.reduce_mean(losses2_arr)
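`select` extracts one time step per example by multiplying with a one-hot mask and summing over the sequence axis; the same trick in NumPy, checked against direct indexing:

import numpy as np

seq_length = 4
logits = np.arange(2 * seq_length * 3, dtype=float).reshape(2, seq_length, 3)
f_loc = np.array([1, 3])

mask = np.eye(seq_length)[f_loc][:, :, None]             # [batch, seq_len, 1]
picked = (logits * mask).sum(axis=1)                     # [batch, 3]
print(np.allclose(picked, logits[np.arange(2), f_loc]))  # True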