Example 1
class Classify(TaskBase):
    def __init__(self, conf):
        super(Classify, self).__init__(conf)
        self.task_type = 'classify'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" %
                     self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep="\t",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error("find blank lines in %s" % idx)
        self.data_type = 'column_2'

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            loss = get_loss(type=self.loss_type,
                            logits=pred,
                            labels=labels,
                            labels_sparse=True,
                            **conf)
            return loss

        def model_fn(features, labels, mode, params):
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()

            #############  encoder  #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                self.embed_query = self.embedding(features=features,
                                                  name='x_query')
                out = self.encoder(self.embed_query,
                                   name='x_query',
                                   features=features)
            else:
                out = self.encoder(features=features)
            #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class))
            pred = tf.nn.softmax(out)
            pred_labels = tf.argmax(pred, axis=-1)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

            ############### loss ##################
            loss = cal_loss(pred, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step,
                                                 params)

            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {
                    "accuracy":
                    tf.metrics.accuracy(labels=labels, predictions=pred_labels)
                }
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                "num_classes_per_batch (%s) > num_class (%s)" % (num_classes_per_batch, self.num_class)
            num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith('train')
            ]
            if len(filenames) == 0:
                logging.warning(
                    "Can't find any tfrecords files for train, preparing now!")
                self.prepare()
                filenames = [
                    os.path.join(self.tfrecords_path, item)
                    for item in os.listdir(self.tfrecords_path)
                    if item.startswith('train')
                ]
            assert size == len(filenames), "each file represents one class"
            logging.info("tfrecords train class num: {}".format(
                len(filenames)))
            logging.info("tfrecords num_sentences_per_class:{}".format(
                num_sentences_per_class))
            logging.info("tfrecords num_classes_per_batch:{}".format(
                num_classes_per_batch))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #assert self.batch_size == num_sentences_per_class* num_classes_per_batch
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features,label = sess.run([features,label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                           features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith(mode)
            ]
            assert self.num_class == len(
                filenames), "number of tfrecords files does not match num_class!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.7}
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]" % mode)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            features.update(self.encoder.features)
            return features

        self.save_model(self.create_model_fn(), params, get_features)
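Note: train_input_fn above builds class-balanced batches by pairing one
TFRecord dataset per class with tf.contrib.data.choose_from_datasets. A
minimal, self-contained sketch of that sampling pattern (assuming TF 1.x,
with toy constant datasets standing in for the real per-class files):

import numpy as np
import tensorflow as tf

num_class = 4
num_classes_per_batch = 2
num_sentences_per_class = 3

# one infinite dataset per class; element i stands in for class i's records
datasets = [tf.data.Dataset.from_tensors(i).repeat() for i in range(num_class)]

def generator():
    # emit dataset indices so each batch covers num_classes_per_batch classes,
    # with num_sentences_per_class elements from each
    while True:
        labels = np.random.choice(range(num_class),
                                  num_classes_per_batch,
                                  replace=False)
        for label in labels:
            for _ in range(num_sentences_per_class):
                yield label

choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
dataset = dataset.batch(num_classes_per_batch * num_sentences_per_class)

batch = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print(sess.run(batch))  # e.g. [2 2 2 0 0 0]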
Example 2
class Classify(object):
    def __init__(self, conf):
        self.task_type = 'classify'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep=",",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        self.num_output = self.num_class
        logging.info(
            f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
        for idx, text in enumerate(self.text_list):
            self.text_list[idx] = self.pre.get_dl_input_by_text(text)
            if len(self.text_list[idx]) == 0:
                logging.error(f"find blank lines in {idx}")

        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def init_embedding(self):
        self.vocab_dict = embedding[self.embedding_type].build_dict(
            dict_path=self.dict_path,
            text_list=self.text_list,
            mode=self.mode)
        self.embedding = embedding[self.embedding_type](
            text_list=self.text_list,
            vocab_dict=self.vocab_dict,
            dict_path=self.dict_path,
            random=self.rand_embedding,
            maxlen=self.maxlen,
            batch_size=self.batch_size,
            embedding_size=self.embedding_size,
            conf=self.conf)

    def prepare(self):
        self.init_embedding()
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list,
                        self.embedding.text2id, self.encoder.encoder_fun,
                        self.vocab_dict, self.tfrecords_path, self.label_path,
                        self.test_size)
        logging.info("tfrecords generated!")

    def cal_loss(self, pred, labels, batch_size, conf):
        loss = get_loss(type=self.loss_type,
                        logits=pred,
                        labels=labels,
                        labels_sparse=True,
                        **conf)
        return loss

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            ########### embedding #################
            if not self.use_language_model:
                self.init_embedding()
                self.embed_query = self.embedding(features=features,
                                                  name='x_query')
            else:
                self.embedding = None
            #############  encoder  #################
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if not self.use_language_model:
                out = self.encoder(self.embed_query,
                                   name='x_query',
                                   features=features)
            else:
                out = self.encoder(features=features)
            #pred = tf.nn.softmax(tf.layers.dense(out, self.num_class))
            pred = tf.nn.softmax(out)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'encode': out,
                    'logit': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

            ############### loss ##################
            loss = self.cal_loss(pred, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(
                        global_step=global_step,
                        learning_rate=self.learning_rate,
                        mode=self.clr_mode)
                optimizer = get_train_op(global_step,
                                         self.optimizer_type,
                                         loss,
                                         self.learning_rate,
                                         clip_grad=5)
                return tf.estimator.EstimatorSpec(mode,
                                                  loss=loss,
                                                  train_op=optimizer)

            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            size = self.num_class
            num_classes_per_batch = self.num_class_per_batch
            assert num_classes_per_batch <= self.num_class, \
                f"num_classes_per_batch ({num_classes_per_batch}) > num_class ({self.num_class})"
            num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
                             for i in range(size)]
            logging.info("tfrecords train class num: {}".format(
                len(filenames)))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #assert self.batch_size == num_sentences_per_class* num_classes_per_batch
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features,label = sess.run([features,label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                           features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
                             for i in range(self.num_class)]
            assert self.num_class == len(
                filenames), "number of tfrecords files does not match num_class!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.5}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)

        def serving_input_receiver_fn():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            features.update(self.encoder.features)
            return tf.estimator.export.ServingInputReceiver(features, features)

        estimator.export_savedmodel(
            self.export_dir_path,  # export directory
            serving_input_receiver_fn,  # function returning a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)
        scores = [item['logit'] for item in predictions]
        labels = [item['label'] for item in predictions]
        max_scores = np.max(scores, axis=-1)
        max_ids = np.argmax(scores, axis=-1)
        res = np.equal(labels, max_ids)
        right = int(np.sum(res))
        total = len(res)  # avoid shadowing the builtin sum()
        print("Acc:{}".format(float(right) / total))
Example 3
class NER(object):
    def __init__(self, conf):
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.task_type = 'ner'
        self.clip_grad = 5.0
        self.label2tag = {
            self.tag2label[item]: item
            for item in self.tag2label
        }
        self.shuffle = True

        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)

        self.pre = Preprocess()
        self.text_list, self.label_list = load_ner_data(self.train_path)
        if self.maxlen == -1:
            self.maxlen = max([len(text.split()) for text in self.text_list])
        self.trans_label_list(self.label_list, self.tag2label)

        self.text_list = [
            self.pre.get_dl_input_by_text(text) for text in self.text_list
        ]

        if not self.use_language_model:
            #build vocabulary map using training data
            self.vocab_dict = embedding[self.embedding_type].build_dict(
                dict_path=self.dict_path, text_list=self.text_list)

            #define embedding object by embedding_type
            self.embedding = embedding[self.embedding_type](
                text_list=self.text_list,
                vocab_dict=self.vocab_dict,
                dict_path=self.dict_path,
                random=self.rand_embedding,
                batch_size=self.batch_size,
                maxlen=self.maxlen,
                embedding_size=self.embedding_size,
                conf=self.conf)
            self.embed = self.embedding(name='x')
        else:
            self.embedding = None
        self.labels = tf.placeholder(tf.int32,
                                     shape=[None, None],
                                     name="labels")
        self.sequence_lengths = tf.placeholder(tf.int32,
                                               shape=[None],
                                               name="sequence_lengths")

        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "keep_prob": self.keep_prob,
            "is_training": self.is_training,
            "batch_size": self.batch_size,
            "num_output": self.num_class
        })

        self.encoder = encoder[self.encoder_type](**params)
        if not self.use_language_model:
            self.out = self.encoder(self.embed, 'query', middle_flag=True)
        else:
            self.out = self.encoder()
        self.output_nodes = self.out.name.split(':')[0]
        self.loss(self.out)
        self.optimizer = get_train_op(self.global_step, self.optimizer_type,
                                      self.loss, learning_rate=self.learning_rate,
                                      clip_grad=self.clip_grad)
        #self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        if self.use_language_model:
            tvars = tf.trainable_variables()
            init_checkpoint = conf['init_checkpoint_path']
            (assignment_map,
             initialized_variable_names) = get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    def loss(self, out):
        out_shape = tf.shape(out)
        self.logits = tf.reshape(out, [-1, out_shape[1], self.num_class])
        if not self.use_crf:
            self.labels_softmax_ = tf.argmax(self.logits, axis=-1)
            self.labels_softmax_ = tf.cast(self.labels_softmax_, tf.int32)
        if self.use_crf:
            log_likelihood, self.transition_params = crf_log_likelihood(
                inputs=self.logits,
                tag_indices=self.labels,
                sequence_lengths=self.sequence_lengths)
            self.loss = -tf.reduce_mean(log_likelihood)

        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.labels)
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        tf.summary.scalar("loss", self.loss)

    def trans_label_list(self, label_list, tag2label):
        for idx, labels in enumerate(label_list):
            for idy, label in enumerate(labels):
                label_list[idx][idy] = tag2label[label_list[idx][idy]]

    def demo_one(self, sess, sent):
        label_list = []
        batches = batch_iter(sent,
                             self.batch_size,
                             self.epoch_num,
                             shuffle=False)
        for batch in batches:
            seqs, labels = zip(*batch)
            label_list_, _ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
        label2tag = {}
        for tag, label in self.tag2label.items():
            label2tag[label] = tag if label != 0 else label
        tag = [label2tag[label] for label in label_list[0]]
        return tag

    def train(self):
        train_data = zip(self.text_list, self.label_list)
        batches = batch_iter(train_data,
                             self.batch_size,
                             self.epoch_num,
                             shuffle=True)

        max_acc = -1
        for step, batch in enumerate(batches):
            x_batch, labels = zip(*batch)
            sys.stdout.write(' processing: {}.'.format(step + 1) + '\r')
            step_num = step + 1

            if not self.use_language_model:
                _, x_batch, len_batch = self.embedding.text2id(
                    x_batch,
                    self.vocab_dict,
                    self.maxlen,
                    need_preprocess=False)
                feed_dict = {self.sequence_lengths: len_batch}
                feed_dict[self.labels], _ = self.embedding.pad_sequences(
                    labels)
                feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
                feed_dict.update(self.encoder.feed_dict(query=len_batch))
            else:
                feed_dict = {}
                feed_dict.update(self.encoder.feed_dict(x_batch))

            _, loss_train, step_num_ = self.sess.run(
                [self.optimizer, self.loss, self.global_step],
                feed_dict=feed_dict)
            if step_num % (self.valid_step // 10) == 0:
                logging.info('step {}, loss: {:.4}'.format(step_num, loss_train))
            if step_num % (self.valid_step) == 0:
                logging.info('===========validation / test===========')
                result = self.test()
                logging.info("result:", result)
                if result['acc'] > max_acc:
                    max_acc = result['acc']
                    self.saver.save(self.sess,
                                    "{0}/{1}.ckpt".format(
                                        self.checkpoint_path, self.task_type),
                                    global_step=step)
                    write_pb(self.checkpoint_path, self.model_path,
                             ["is_training", self.output_nodes])
                else:
                    self.save_pb()
                    logging.info(f'train finished! accuracy: {max_acc}')
                    sys.exit(0)

    def test(self):
        #saver = tf.train.Saver()
        #with tf.Session() as sess:
        #    logging.info('=========== testing ===========')
        #    saver.restore(sess, self.model_path)
        #    label_list, seq_len_list = self.dev_one_epoch(sess, test)
        #    self.evaluate(label_list, seq_len_list, test)

        self.raw_dev_text_list, self.dev_label_list = load_ner_data(
            self.test_path)
        #self.raw_dev_text_list, self.dev_label_list = \
        #    self.raw_dev_text_list[:50], self.dev_label_list[:50]
        self.dev_text_list = [self.pre.get_dl_input_by_text(text) for \
                              text in self.raw_dev_text_list]
        self.trans_label_list(self.dev_label_list, self.tag2label)
        dev_data = zip(self.dev_text_list, self.dev_label_list)
        out_label_list, seq_len_list = self.dev_one_epoch(self.sess, dev_data)
        result = self.evaluate(self.dev_label_list, out_label_list, \
                               self.raw_dev_text_list, seq_len_list)
        return result

    def dev_one_epoch(self, sess, dev):
        """Run prediction over one epoch of dev data.

        :param sess: tf.Session to run the model in
        :param dev: iterable of (text, label) pairs
        :return: predicted label sequences and their true lengths
        """
        label_list, seq_len_list = [], []
        batches = batch_iter(dev,
                             self.batch_size,
                             self.epoch_num,
                             shuffle=False)
        for batch in batches:
            seqs, labels = zip(*batch)
            label_list_, seq_len_list_ = self.predict_one_batch(sess, seqs)
            label_list.extend(label_list_)
            seq_len_list.extend(seq_len_list_)
        return label_list, seq_len_list

    def predict_one_batch(self, sess, seqs):
        """Predict tag sequences for one batch.

        :param sess: tf.Session to run the model in
        :param seqs: batch of input token sequences
        :return: label_list, predicted tag ids per sentence
                 seq_len_list, true sequence lengths
        """
        if not self.use_language_model:
            _, x_batch, len_batch = self.embedding.text2id(
                seqs, self.vocab_dict, self.maxlen, need_preprocess=False)
            feed_dict = {self.sequence_lengths: len_batch}
            feed_dict.update(self.embedding.feed_dict(x_batch, 'x'))
            feed_dict.update(self.encoder.feed_dict(query=len_batch))
        else:
            #mirror train(): the language-model encoder consumes raw sequences
            len_batch = [len(seq) for seq in seqs]
            feed_dict = {}
            feed_dict.update(self.encoder.feed_dict(seqs))

        if self.use_crf:
            logits, transition_params = sess.run(
                [self.logits, self.transition_params], feed_dict=feed_dict)
            label_list = []
            for logit, seq_len in zip(logits, len_batch):
                viterbi_seq, _ = viterbi_decode(logit[:seq_len],
                                                transition_params)
                label_list.append(viterbi_seq)
            return label_list, len_batch

        else:
            label_list = sess.run(self.labels_softmax_, feed_dict=feed_dict)
            return label_list, len_batch

    #def evaluate(self, label_list, seq_len_list, data, epoch=None):
    def evaluate(self, dev_label_list, out_label_list, raw_dev_text_list, \
                 seq_len_list):
        model_predict = []
        for label, label_pred, sent, seq_len in zip(dev_label_list,
                                                    out_label_list,
                                                    raw_dev_text_list,
                                                    seq_len_list):
            sent = sent.split()
            sent_res = []
            for idx in range(seq_len):
                sent_res.append([sent[idx], label[idx], label_pred[idx]])
            model_predict.append(sent_res)

        accs = []
        correct_preds, total_correct, total_preds = 0., 0., 0.
        for item in model_predict:
            sent = [i[0] for i in item]
            lab = [i[1] for i in item]
            lab_pred = [i[2] for i in item]
            accs += [a == b for (a, b) in zip(lab, lab_pred)]
            lab_chunks = set(get_chunks(lab, self.tag2label))
            lab_pred_chunks = set(get_chunks(lab_pred, self.tag2label))

            correct_preds += len(lab_chunks & lab_pred_chunks)
            total_preds += len(lab_pred_chunks)
            total_correct += len(lab_chunks)
        p = correct_preds / total_preds if correct_preds > 0 else 0
        r = correct_preds / total_correct if correct_preds > 0 else 0
        f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
        acc = np.mean(accs)
        return {"acc": 100 * acc, "f1": 100 * f1}
Example 4
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        if self.maxlen == -1:
            self.maxlen = max([len(text.split()) for text in self.text_list])
        #model params
        params = conf
        params.update({
            "maxlen": self.maxlen,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })

        #params['num_output'] = 128
        #self.encoder_base = encoder['transformer'](**params)
        #params['num_output'] = self.num_class
        self.encoder = encoder[self.encoder_type](**params)


    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text, self.use_generalization) for text in self.text_list]
        self.num_class = self.num_output = len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            if mode == tf.estimator.ModeKeys.TRAIN:
                self.encoder.keep_prob = 0.5
                self.encoder.is_training = True
            else:
                self.encoder.keep_prob = 1
                self.encoder.is_training = False

            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()

            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features = features, name = 'x_query')
                out = self.encoder(embed, 'x_query', features = features, middle_flag = True)
                #out = self.encoder_base(embed, 'x_query', features = features, middle_flag = True)
                #out = self.encoder(out, 'x_query', features = features, middle_flag = True)
            else:
                out = self.encoder(features = features)

            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])

            batch_size = get_placeholder_batch_size(logits)
            small = -1000
            start_logits = tf.concat([
                small*tf.ones(shape=[batch_size, 1, self.num_class]), 
                tf.zeros(shape=[batch_size, 1, 1])],
                                     axis=-1)
            pad_logits = tf.cast(small * tf.ones(shape=[batch_size, self.maxlen,
                                                        1]), tf.float32)
            logits = tf.concat([logits, pad_logits], axis = -1)
            logits = tf.concat([start_logits, logits], axis = 1)
            seq_len += 1
            transition_params = tf.get_variable('crf', 
                                         [self.num_class + 1,self.num_class + 1], 
                                         dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            else:
                ############### loss ####################
                labels = tf.concat([
                    tf.cast(self.num_class * tf.ones(shape=[batch_size, 1]), tf.int64), 
                    labels
                ], axis = -1)
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(logits, 
                                                                      labels,
                                                                      seq_len,
                                                                      transition_params)
                loss = -tf.reduce_mean(log_likelihood)
                if mode == tf.estimator.ModeKeys.TRAIN:
                    return self.train_estimator_spec(mode, loss, global_step, params)
                if mode == tf.estimator.ModeKeys.EVAL:
                    weights = tf.sequence_mask(seq_len, self.maxlen+1)
                    metrics = {'acc': tf.metrics.accuracy(labels, pred_ids, weights)}
                    return tf.estimator.EstimatorSpec(mode, 
                                                      loss=loss, 
                                                      eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()
        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith('train')]
            if len(filenames) == 0:
                logging.warn("Can't find any tfrecords file for train, prepare now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path,item) for item in 
                             os.listdir(self.tfrecords_path) if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()

            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.shuffle(buffer_size=100*self.batch_size)
            dataset = dataset.prefetch(4*self.batch_size)
            dataset = dataset.batch(self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!"%mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        def get_features():
            features = {'x_query': tf.placeholder(dtype=tf.int64, 
                                                  shape=[None, self.maxlen],
                                                  name='x_query'),
                        'x_query_length': tf.placeholder(dtype=tf.int64,
                                                         shape=[None],
                                                         name='x_query_length'),
                        }
            features.update(self.encoder.get_features())
            return features
        self.save_model(self.create_model_fn(), None, get_features)

    def train(self):
        estimator = self.get_train_estimator(self.create_model_fn(), None)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def test(self, mode='test'):
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]"%mode)

    def train_and_evaluate(self):
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path,
                                        save_checkpoints_steps=self.save_interval,
                                        keep_checkpoint_max=5)

        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config)

        early_stop = tf.estimator.experimental.stop_if_no_decrease_hook(
            estimator=estimator,
            metric_name="loss",
            max_steps_without_decrease=estimator.config.save_checkpoints_steps * 2,
            run_every_secs=None,
            run_every_steps=estimator.config.save_checkpoints_steps,
        )

        train_spec = tf.estimator.TrainSpec(
            input_fn=self.create_input_fn("train"),
            max_steps=self.max_steps,
            hooks=[early_stop])

        eval_spec = tf.estimator.EvalSpec(
            input_fn=self.create_input_fn("dev"),
            steps=None,
            start_delay_secs=1,  # start evaluating after N seconds
            throttle_secs=10)    # evaluate at most once every N seconds
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        self.save()
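Note: model_fn above pads the logits with an extra "start" tag before CRF
decoding, so the transition matrix can learn which real tags may begin a
sequence. A shape-only sketch of that augmentation (assuming TF 1.x; the
toy sizes are illustrative):

import tensorflow as tf

batch_size, maxlen, num_class = 2, 5, 3
small = -1000.0
logits = tf.zeros([batch_size, maxlen, num_class])

start_logits = tf.concat(
    [small * tf.ones([batch_size, 1, num_class]),  # forbid real tags at step 0
     tf.zeros([batch_size, 1, 1])],                # only the start tag is allowed
    axis=-1)                                       # -> [batch, 1, num_class+1]
pad_logits = small * tf.ones([batch_size, maxlen, 1])
logits = tf.concat([logits, pad_logits], axis=-1)  # -> [batch, maxlen, num_class+1]
logits = tf.concat([start_logits, logits], axis=1) # -> [batch, maxlen+1, num_class+1]
print(logits.shape)  # (2, 6, 4); sequence lengths must also be shifted by one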
Example 5
class Match(TaskBase):
    def __init__(self, conf):
        super(Match, self).__init__(conf)
        self.task_type = 'match'
        self.conf = conf
        self.read_data()
        self.num_class = len(set(self.label_list))
        logging.info(">>>>>>>>>>>> class num:%s <<<<<<<<<<<<<<<" %
                     self.num_class)
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def read_data(self):
        self.pre = Preprocess()
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep="\t",
                          error_bad_lines=False)
        if 'text' in csv.keys() and 'target' in csv.keys():
            #format: text \t target
            #for this format, the size for each class should be larger than 2
            self.text_list = list(csv['text'])
            self.label_list = list(csv['target'])
            self.data_type = 'column_2'
        elif ('text_a' in csv.keys() and 'text_b' in csv.keys()
              and 'target' in csv.keys()):
            #format: text_a \t text_b \t target
            #for this format, target value can only be chosen from 0 or 1
            self.text_a_list = list(csv['text_a'])
            self.text_b_list = list(csv['text_b'])
            self.text_list = self.text_a_list + self.text_b_list
            self.label_list = list(csv['target'])
            self.data_type = 'column_3'
        else:
            raise ValueError('error format for train file')
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                          self.text_list]

    def create_model_fn(self):
        def cal_loss(pred, labels, batch_size, conf):
            if self.tfrecords_mode == 'class':
                pos_scores, neg_scores = batch_hard_triplet_scores(
                    labels, pred,
                    is_distance=self.is_distance)  # pos/neg scores
                pos_scores = tf.squeeze(pos_scores, -1)
                neg_scores = tf.squeeze(neg_scores, -1)
                #for represent mode, pred is a batch of encodings (size > 1),
                #so we can use a triplet loss (hinge loss) or a contrastive loss
                #if we use hinge loss, we don't need labels
                #for other losses (e.g. contrastive), we define pos/neg targets first
                if self.loss_type in ['hinge_loss', 'improved_triplet_loss']:
                    #pairwise
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    **conf)
                else:
                    #pointwise
                    pos_target = tf.ones(shape=[int(self.batch_size)],
                                         dtype=tf.float32)
                    neg_target = tf.zeros(shape=[int(self.batch_size)],
                                          dtype=tf.float32)

                    pos_loss = get_loss(type=self.loss_type,
                                        logits=pos_scores,
                                        labels=pos_target,
                                        **conf)
                    neg_loss = get_loss(type=self.loss_type,
                                        logits=neg_scores,
                                        labels=neg_target,
                                        **conf)
                    loss = pos_loss + neg_loss

            elif self.tfrecords_mode in ['pair', 'point']:
                if self.loss_type in ['hinge_loss', 'improved_triplet_loss']:
                    assert self.tfrecords_mode == 'pair', "only pair mode can provide <query, pos, neg> format data"
                    #pairwise
                    if self.num_output == 1:
                        pred = tf.nn.sigmoid(pred)
                    elif self.num_output == 2:
                        pred = tf.nn.softmax(pred)[:, 0]
                        pred = tf.expand_dims(pred, -1)
                    else:
                        raise ValueError(
                            'unsupported num_output, 1(sigmoid) or 2(softmax)?'
                        )
                    pos_scores = tf.strided_slice(pred, [0], [batch_size], [2])
                    neg_scores = tf.strided_slice(pred, [1], [batch_size], [2])
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    **conf)
                elif self.loss_type in ['sigmoid_loss']:
                    #pointwise
                    labels = tf.expand_dims(labels, axis=-1)
                    loss = get_loss(type=self.loss_type,
                                    logits=pred,
                                    labels=labels,
                                    **conf)
                else:
                    raise ValueError('unsupported loss for pair/point match')
            else:
                raise ValueError('unknown tfrecords_mode?')
            return loss

        def model_fn(features, labels, mode, params):
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()

            ############# encode #################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                    output = self.encoder(self.embed_query,
                                          name='x_query',
                                          features=features)
                    output = tf.nn.l2_normalize(output, -1)

                elif self.tfrecords_mode in ['pair', 'point']:
                    if self.sim_mode == 'cross':
                        self.embed_query = self.embedding(features=features,
                                                          name='x_query')
                        self.embed_sample = self.embedding(features=features,
                                                           name='x_sample')
                        output = self.encoder(x_query=self.embed_query,
                                              x_sample=self.embed_sample,
                                              features=features)
                    elif self.sim_mode == 'represent':
                        self.embed_query = self.embedding(features=features,
                                                          name='x_query')
                        self.embed_sample = self.embedding(features=features,
                                                           name='x_sample')
                        query_encode = self.encoder(self.embed_query,
                                                    name='x_query',
                                                    features=features)
                        sample_encode = self.encoder(self.embed_sample,
                                                     name='x_sample',
                                                     features=features)
                        output = self.concat(query_encode, sample_encode)
                        output = tf.layers.dense(
                            output,
                            1,
                            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001),
                            name='fc')
                    else:
                        raise ValueError(
                            'unknown sim_mode, represent or cross')
            else:
                output = self.encoder(features=features)

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                #pdb.set_trace()
                predictions = {
                    'encode':
                    output,
                    'pred':
                    tf.cast(tf.greater(tf.nn.softmax(output)[:, 0], 0.5),
                            tf.int32) if self.num_output == 2 else
                    tf.cast(tf.greater(tf.nn.sigmoid(output), 0.5), tf.int32),
                    'score':
                    tf.nn.softmax(output)[:, 0]
                    if self.num_output == 2 else tf.nn.sigmoid(output),
                    'label':
                    features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)

            ############### loss ##################
            loss = cal_loss(output, labels, self.batch_size, self.conf)

            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                return self.train_estimator_spec(mode, loss, global_step,
                                                 params)
            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            if self.tfrecords_mode == 'class':
                #size = self.num_class
                num_classes_per_batch = 32
                assert num_classes_per_batch < self.num_class
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            elif self.tfrecords_mode == 'pair':
                #data order: query,pos,query,neg
                num_sentences_per_class = 4
                num_classes_per_batch = self.batch_size // num_sentences_per_class
            elif self.tfrecords_mode == 'point':
                #data order: query, sample(pos or neg)
                num_classes_per_batch = 2
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            else:
                raise ValueError('unknown tfrecords_mode')

            #filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
            #                 for i in range(size)]
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith('train')
            ]
            if len(filenames) == 0:
                logging.warning(
                    "Can't find any tfrecords files for train, preparing now!")
                self.prepare()
                filenames = [
                    os.path.join(self.tfrecords_path, item)
                    for item in os.listdir(self.tfrecords_path)
                    if item.startswith('train')
                ]
            size = len(filenames)
            logging.info("tfrecords train class num: {}".format(size))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #datasets = [dataset.shuffle(buffer_size=1000) for dataset in datasets]
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            ##test
            #pdb.set_trace()
            #sess = tf.Session()
            #features1,label1 = sess.run([features,label])
            #features1['x_query_pred'] = [item.decode('utf-8') for item in features1['x_query_pred'][1]]
            #features1['x_sample_pred'] = [item.decode('utf-8') for item in features1['x_sample_pred'][1]]
            return features, label

        def test_input_fn(mode):
            #filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
            #                 for i in range(self.num_class * self.dev_size)]
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith(mode)
            ]
            assert self.num_class == len(
                filenames), "unexpected number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'dev':
            return lambda: test_input_fn("dev")
        elif mode == 'label':
            return lambda: test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.7,
        }
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}

        def get_features():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            if self.tfrecords_mode in ['pair', 'point']:
                features.update({
                    'x_sample':
                    tf.placeholder(dtype=tf.int64,
                                   shape=[None, self.maxlen],
                                   name='x_sample'),
                    'x_sample_length':
                    tf.placeholder(dtype=tf.int64,
                                   shape=[None],
                                   name='x_sample_length')
                })
            features.update(self.encoder.get_features())
            return features

        self.save_model(self.create_model_fn(), params, get_features)

    def test(self, mode='test'):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn(mode))
        predictions = list(predictions)

        if self.tfrecords_mode == 'class':
            predictions_vec = [item['encode'] for item in predictions]
            predictions_label = [item['label'] for item in predictions]
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers)

            refers_vec = [item['encode'] for item in refers]
            refers_label = [item['label'] for item in refers]

            right = 0
            thre_right = 0
            sum = 0

            if self.is_distance:
                scores = euclidean_distances(predictions_vec, refers_vec)
                selected_ids = np.argmin(scores, axis=-1)
            else:
                scores = cosine_similarity(predictions_vec, refers_vec)
                selected_ids = np.argmax(scores, axis=-1)
            for idx, item in enumerate(selected_ids):
                if refers_label[item] == predictions_label[idx]:
                    if self.is_distance:
                        if 1 - scores[idx][item] > self.score_thre:
                            thre_right += 1
                    else:
                        if scores[idx][item] > self.score_thre:
                            thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right) / sum))
            print("ThreAcc:{}".format(float(thre_right) / sum))
        elif self.tfrecords_mode == 'pair':
            #evaluation for 'pair' mode
            scores = [item['score'] for item in predictions]
            labels = [item['label'] for item in predictions]
            #pdb.set_trace()

            #predictions
            scores = np.reshape(scores, [self.num_class * self.dev_size, -1])
            pred_max_ids = np.argmax(scores, axis=-1)
            #label
            labels = np.reshape(labels, [self.num_class * self.dev_size, -1])

            right = 0
            for idx, max_id in enumerate(pred_max_ids):
                if labels[idx][max_id] == 1:
                    right += 1
            sum = len(pred_max_ids)
            print("Acc:{}".format(float(right) / sum))

        elif self.tfrecords_mode == 'point':
            scores = [item['score'] for item in predictions]
            scores = np.reshape(scores, -1)
            scores = [0 if item < self.score_thre else 1 for item in scores]
            #pred = [item['pred'] for item in predictions]
            labels = [item['label'] for item in predictions]
            res = metrics(labels=labels, logits=np.array(scores))
            print("precision:{} recall:{} f1:{}".format(
                res[3], res[4], res[5]))

    def concat(self, a, b):
        tmp = tf.concat([a, b], axis=-1)
        #return tmp
        res1 = a * b
        res2 = a + b
        res3 = a - b
        return tf.concat([tmp, res1, res2, res3], axis=-1)

    def knn(self, scores, predictions_label, refers_label, k=4):
        sorted_id = np.argsort(-scores, axis=-1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp, key=mp.get))
        return max_id
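
The knn helper above assigns each query the majority label among its k most
similar reference vectors. A minimal standalone sketch of the same voting
scheme, with made-up toy vectors and labels for illustration (numpy and
scikit-learn only):

import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

def knn_vote(scores, refers_label, k=2):
    # scores: [num_queries, num_refers], higher = more similar
    sorted_id = np.argsort(-scores, axis=-1)
    voted = []
    for row in sorted_id:
        votes = defaultdict(int)
        for ref_idx in row[:k]:
            votes[refers_label[int(ref_idx)]] += 1
        voted.append(max(votes, key=votes.get))
    return voted

queries = np.array([[1.0, 0.1], [0.2, 0.9]])                  # toy queries
refers = np.array([[0.9, 0.0], [1.0, 0.2], [0.1, 1.0], [0.0, 0.8]])
labels = ['greet', 'greet', 'bye', 'bye']                     # toy labels
print(knn_vote(cosine_similarity(queries, refers), labels))   # ['greet', 'bye']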
Example n. 6
class Match(object):
    def __init__(self, conf):
        self.task_type = 'match'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.graph = tf.get_default_graph()
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path, header = 0, sep=",", error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        logging.info(f">>>>>>>>>>>>>>class num:{self.num_class}")
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                          self.text_list]
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def init_embedding(self):
        self.vocab_dict = embedding[self.embedding_type].build_dict(\
                                            dict_path = self.dict_path,
                                            text_list = self.text_list,
                                            mode = self.mode)
        self.embedding = embedding[self.embedding_type](text_list = self.text_list,
                                                        vocab_dict = self.vocab_dict,
                                                        dict_path = self.dict_path,
                                                        random=self.rand_embedding,
                                                        maxlen = self.maxlen,
                                                        batch_size = self.batch_size,
                                                        embedding_size =
                                                        self.embedding_size,
                                                        conf = self.conf)

    def prepare(self):
        self.init_embedding()
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list, self.embedding.text2id,
                        self.encoder.encoder_fun, self.vocab_dict,
                        self.tfrecords_path, self.label_path)

    def cal_loss(self, pred, labels, pos_target, neg_target, batch_size, conf):
        if self.loss_type == 'hinge_loss':
            if self.sub_loss_type == 'all':
                loss = batch_all_triplet_loss(labels, pred, conf['margin'])
            else:
                loss = batch_hard_triplet_loss(labels, pred, conf['margin'])
        else:
            loss = get_loss(type = self.loss_type, logits = pred, labels =
                                labels, **conf)
        return loss

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            if not self.use_language_model:
                self.init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features = features, name = 'x_query')
                else:
                    self.embed_query = self.embedding(features = features, name = 'x_query')
                    self.embed_sample = self.embedding(features = features, name = 'x_sample')
            else:
                self.embedding = None
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if self.sim_mode == 'cross':
                if not self.use_language_model:
                    pred = self.encoder(x_query = self.embed_query, 
                                        x_sample = self.embed_sample,
                                        features = features)
                else:
                    pred = self.encoder(features = features)
            elif self.sim_mode == 'represent':
                if not self.use_language_model:
                    #features['x_query_length'] = features['length']
                    pred = self.encoder(self.embed_query, 
                                                     name = 'x_query', 
                                                     features = features)
                else:
                    pred = self.encoder(features = features)
            else:
                raise ValueError('unknown sim mode')

            pos_target = tf.ones(shape = [int(self.batch_size/2)], dtype = tf.float32)
            neg_target = tf.zeros(shape = [int(self.batch_size/2)], dtype = tf.float32)
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'pred': pred,
                    'label': features['label']
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            loss = self.cal_loss(pred,
                             labels,
                             pos_target,
                             neg_target,
                             self.batch_size,
                             self.conf)
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(global_step=global_step,
                                                          learning_rate = self.learning_rate, 
                                                          mode = self.clr_mode)
                optimizer = get_train_op(global_step, 
                                         self.optimizer_type, 
                                         loss,
                                         self.learning_rate, 
                                         clip_grad = 5)
                return tf.estimator.EstimatorSpec(mode, loss = loss,
                                                      train_op=optimizer)
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()
        def train_input_fn():
            if self.tfrecords_mode == 'pair':
                size = self.num_pair
                num_classes_per_batch = 2
                num_sentences_per_class = self.batch_size // num_classes_per_batch
            else:
                size = self.num_class
                num_classes_per_batch = 16
                num_sentences_per_class = self.batch_size // num_classes_per_batch

            filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
                             for i in range(size)]
            logging.info("tfrecords train class num: {}".format(len(filenames)))
            datasets = [tf.data.TFRecordDataset(filename) for filename in filenames]
            datasets = [dataset.repeat() for dataset in datasets]
            #assert self.batch_size == num_sentences_per_class* num_classes_per_batch
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4*self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            #test
            #sess = tf.Session()
            #features,label = sess.run([features,label])
            #features['x_query_pred'] = [item.decode('utf-8') for item in
            #                           features['x_query_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
                             for i in range(self.num_class)]
            assert self.num_class == len(filenames), "unexpected number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda : test_input_fn("test")
        elif mode == 'label':
            return lambda : test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.5
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        estimator.train(input_fn = self.create_input_fn("train"), max_steps =
                        self.max_steps)
        self.save()

    def save(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        def serving_input_receiver_fn():
            x_query = tf.placeholder(dtype=tf.int64, shape=[None, self.maxlen],
                                   name='x_query')
            length = tf.placeholder(dtype=tf.int64, shape=[None], name='x_query_length')
            label = tf.placeholder(dtype=tf.int64, shape=[None], name='label')

            receiver_tensors = {'x_query': x_query, 'x_query_length': length, 'label': label}
            features = {'x_query': x_query, 'x_query_length': length, 'label': label}
            return tf.estimator.export.ServingInputReceiver(receiver_tensors,
                                                            features)
        estimator.export_savedmodel(
            self.export_dir_path, # export directory
            serving_input_receiver_fn, # function that returns a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)
        predictions_vec = [item['pred'] for item in predictions]
        predictions_label = [item['label'] for item in predictions]
        if self.tfrecords_mode == 'class':
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers) 

            refers_vec = [item['pred'] for item in refers]
            refers_label = [item['label'] for item in refers]

            right = 0
            thre_right = 0
            sum = 0
            scores = cosine_similarity(predictions_vec, refers_vec)
            max_id = np.argmax(scores, axis=-1)
            #max_id = self.knn(scores, predictions_label, refers_label)
            for idx, item in enumerate(max_id):
                if refers_label[item] == predictions_label[idx]:
                    if scores[idx][item] > self.score_thre:
                        thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right)/sum))
            print("ThreAcc:{}".format(float(thre_right)/sum))
        else:
            #TODO: evaluation for 'pair' mode
            raise NotImplementedError("pair-mode evaluation is not implemented yet")

    def knn(self, scores, predictions_label, refers_label, k = 4):
        sorted_id = np.argsort(-scores, axis = -1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp,key=mp.get))
        return max_id

    def test_unit(self, text):
        #######################init#########################
        if self.model_loaded == False:
            #add samples that do not participate in training
            if os.path.exists(self.no_train_path):
                csv = pd.read_csv(self.no_train_path, header = 0, sep=",", error_bad_lines=False)
                self.text_list += list(csv['text'])
                self.label_list += list(csv['target'])
            subdirs = [x for x in Path(self.export_dir_path).iterdir()
                    if x.is_dir() and 'temp' not in str(x)]
            latest = str(sorted(subdirs)[-1])
            self.predict_fn = predictor.from_saved_model(latest)
            self.init_embedding()
            self.model_loaded = True
            self.vec_list = self._get_vecs(self.predict_fn, self.text_list)
            #self.set_zdy_labels(['睡觉','我回家了','晚安','娃娃了','周杰伦','自然语言处理'],
            #                    ['打开情景模式','打开情景模式','打开情景模式',
            #                     '打开情景模式','打开情景模式','打开情景模式'])
        text_list = self.text_list
        vec_list = self.vec_list
        label_list = self.label_list

        #for adding user-defined queries (they take priority over the built-in ones)
        if self.zdy != {}:
            text_list = self.zdy['text_list'] + text_list
            vec_list = np.concatenate([self.zdy['vec_list'], self.vec_list], axis = 0)
            label_list = self.zdy['label_list'] + label_list
        vec = self._get_vecs(self.predict_fn, [text], need_preprocess = True)
        scores = cosine_similarity(vec, vec_list)[0]
        max_id = np.argmax(scores)
        max_score = scores[max_id]
        max_similar = text_list[max_id]
        logging.info("test result: {}, {}, {}".format(label_list[max_id], max_score, max_similar))
        return label_list[max_id], max_score, max_id

    def set_zdy_labels(self, text_list, label_list):
        if len(text_list) == 0 or len(label_list) == 0: 
            self.zdy = {}
            return
        self.zdy['text_list'] = text_list
        self.zdy['vec_list'] = self._get_vecs(self.predict_fn, 
                                              text_list,
                                              need_preprocess = True)
        self.zdy['label_list'] = label_list

    def _get_vecs(self, predict_fn, text_list, need_preprocess = False):
        #generate vectors for a batch of texts
        text_list_pred, x_query, x_query_length = self.embedding.text2id(text_list,
                                                     self.vocab_dict,
                                                     need_preprocess)
        label = [0 for _ in range(len(text_list))]

        predictions = predict_fn({'x_query': x_query, 
                                  'x_query_length': x_query_length, 
                                  'label': label})
        return predictions['pred']
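
test_unit and _get_vecs above serve the exported model through
tf.contrib.predictor instead of an Estimator. A minimal sketch of that flow,
assuming a SavedModel exported by save() exists under export_dir (the path,
maxlen and feed values below are illustrative):

import numpy as np
from pathlib import Path
from tensorflow.contrib import predictor

export_dir = 'export/match'                      # assumed export root
subdirs = [x for x in Path(export_dir).iterdir()
           if x.is_dir() and 'temp' not in str(x)]
predict_fn = predictor.from_saved_model(str(sorted(subdirs)[-1]))

maxlen = 20                                      # must match the training conf
out = predict_fn({'x_query': np.zeros([1, maxlen], dtype=np.int64),
                  'x_query_length': np.array([5], dtype=np.int64),
                  'label': np.array([0], dtype=np.int64)})
vec = out['pred']                                # sentence vector(s)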
Example n. 7
class NER(TaskBase):
    def __init__(self, conf):
        super(NER, self).__init__(conf)
        self.task_type = 'ner'
        self.conf = conf
        self.read_data()
        #if self.maxlen == -1:
        #    self.maxlen = max([len(text.split()) for text in self.text_list])
        #model params
        params = conf
        params.update({
            "maxlen":self.maxlen,
            "embedding_size":self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "keep_prob": 1,
            "is_training": False,
        })

        self.encoder = encoder[self.encoder_type](**params)

    def read_data(self):
        self.pre = Preprocess()
        self.util = NERUtil()
        self.text_list, self.label_list = self.util.load_ner_data(self.ori_path)
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in self.text_list]
        self.num_class = self.num_output = len(set(list(chain.from_iterable(self.label_list))))
        self.data_type = 'column_2'

    def create_model_fn(self):
        def model_fn(features, labels, mode, params):
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            seq_len = features['x_query_length']
            global_step = tf.train.get_or_create_global_step()

            ################ encode ##################
            if not self.use_language_model:
                self.embedding, _ = self.init_embedding()
                embed = self.embedding(features = features, name = 'x_query')
                out = self.encoder(embed, 'x_query', features = features, middle_flag = True)
            else:
                out = self.encoder(features = features)
            logits = tf.reshape(out, [-1, int(out.shape[1]), self.num_class])

            transition_params = tf.get_variable('crf', 
                                         [self.num_class,self.num_class], 
                                         dtype=tf.float32)
            pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)
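            # transition_params is shared between decoding here and
            # crf_log_likelihood below, so the tag-transition matrix is
            # learned jointly with the encoder; crf_decode runs Viterbi
            # over the emission logits to get the best tag path.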

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'logit': logits,
                    'pred_ids': pred_ids,
                }
                return tf.estimator.EstimatorSpec(mode, predictions=predictions)
            else:
                ############### loss ####################
                log_likelihood, _ = tf.contrib.crf.crf_log_likelihood(logits, labels,
                                                                      seq_len,
                                                                      transition_params)
                loss = -tf.reduce_mean(log_likelihood)
                if mode == tf.estimator.ModeKeys.TRAIN:
                    return self.train_estimator_spec(mode, loss, global_step, params)
                if mode == tf.estimator.ModeKeys.EVAL:
                    #pdb.set_trace()
                    weights = tf.sequence_mask(seq_len, self.maxlen)
                    metrics = {'acc': tf.metrics.accuracy(labels, pred_ids, weights)}
                    #metrics = {'acc': tf.metrics.accuracy(labels, pred_ids)}
                    return tf.estimator.EstimatorSpec(mode, 
                                                      loss=loss, 
                                                      eval_metric_ops=metrics)
        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()
        def train_input_fn():
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith('train')]
            if len(filenames) == 0:
                logging.warning("Can't find any tfrecords file for train, preparing now!")
                self.prepare()
                filenames = [os.path.join(self.tfrecords_path,item) for item in 
                             os.listdir(self.tfrecords_path) if item.startswith('train')]
            dataset = tf.data.TFRecordDataset(filenames)
            dataset = dataset.repeat()
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4*self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        def test_input_fn(mode):
            filenames = [os.path.join(self.tfrecords_path,item) for item in 
                         os.listdir(self.tfrecords_path) if item.startswith(mode)]
            assert len(filenames) > 0, "Can't find any tfrecords file for %s!"%mode
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(lambda record: gt.parse_record(record, self.encoder),
                                  num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda : test_input_fn("test")
        elif mode == 'dev':
            return lambda : test_input_fn("dev")
        else:
            raise ValueError("unknown input_fn type!")

    def save(self):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        def get_features():
            features = {'x_query': tf.placeholder(dtype=tf.int64, 
                                                  shape=[None, self.maxlen],
                                                  name='x_query'),
                        'x_query_length': tf.placeholder(dtype=tf.int64,
                                                         shape=[None],
                                                         name='x_query_length'),
                        }
                        #'label': tf.placeholder(dtype=tf.int64, 
                        #                        shape=[None],
                        #                        name='label')}
            features.update(self.encoder.get_features())
            return features
        self.save_model(self.create_model_fn(), params, get_features)

    def train(self):
        params = {
            'is_training': True,
            'keep_prob': 0.7
        }
        estimator = self.get_train_estimator(self.create_model_fn(), params)
        estimator.train(input_fn = self.create_input_fn("train"), max_steps =
                        self.max_steps)
        self.save()

    def test(self, mode = 'test'):
        params = {
            'is_training': False,
            'keep_prob': 1
        }
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn = self.create_model_fn(),
                                           config = config,
                                           params = params)
        if mode == 'dev':
            estimator.evaluate(input_fn=self.create_input_fn('dev'))
        elif mode == 'test':
            estimator.evaluate(input_fn=self.create_input_fn('test'))
        else:
            raise ValueError("unknown mode:[%s]"%mode)
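
The NER model_fn couples the encoder with a CRF output layer:
crf_log_likelihood trains the transition matrix and crf_decode produces the
most likely tag path at inference time. A minimal standalone TF1 sketch of
the decode step, with random emission logits purely for illustration:

import tensorflow as tf

batch, maxlen, num_tags = 2, 5, 3
logits = tf.random_uniform([batch, maxlen, num_tags])
transition_params = tf.get_variable('crf', [num_tags, num_tags],
                                    dtype=tf.float32)
seq_len = tf.constant([5, 3], dtype=tf.int32)   # true lengths per sequence
pred_ids, _ = tf.contrib.crf.crf_decode(logits, transition_params, seq_len)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(pred_ids))                   # [batch, maxlen] tag ids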
Example n. 8
class Match(object):
    def __init__(self, conf):
        self.task_type = 'match'
        self.conf = conf
        for attr in conf:
            setattr(self, attr, conf[attr])
        self.pre = Preprocess()
        self.model_loaded = False
        self.zdy = {}
        csv = pd.read_csv(self.ori_path,
                          header=0,
                          sep=",",
                          error_bad_lines=False)
        self.text_list = list(csv['text'])
        self.label_list = list(csv['target'])
        self.num_class = len(set(self.label_list))
        logging.info(
            f">>>>>>>>>>>> class num:{self.num_class} <<<<<<<<<<<<<<<")
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in \
                          self.text_list]
        self.conf.update({
            "maxlen": self.maxlen,
            "maxlen1": self.maxlen,
            "maxlen2": self.maxlen,
            "num_class": self.num_class,
            "embedding_size": self.embedding_size,
            "batch_size": self.batch_size,
            "num_output": self.num_output,
            "keep_prob": 1,
            "is_training": False,
        })
        self.encoder = encoder[self.encoder_type](**self.conf)

    def prepare(self):
        vocab_dict = embedding[self.embedding_type].build_dict(\
                                            dict_path = self.dict_path,
                                            text_list = self.text_list,
                                            mode = self.mode)
        text2id = embedding[self.embedding_type].text2id
        self.gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
        self.gt.process(self.text_list, self.label_list, text2id,
                        self.encoder.encoder_fun, vocab_dict,
                        self.tfrecords_path, self.label_path, self.test_size)
        logging.info("tfrecords generated!")

    def create_model_fn(self):
        def init_embedding():
            vocab_dict = embedding[self.embedding_type].build_dict(\
                                                dict_path = self.dict_path,
                                                text_list = self.text_list,
                                                mode = self.mode)
            return embedding[self.embedding_type](
                text_list=self.text_list,
                vocab_dict=vocab_dict,
                dict_path=self.dict_path,
                random=self.rand_embedding,
                maxlen=self.maxlen,
                batch_size=self.batch_size,
                embedding_size=self.embedding_size,
                conf=self.conf)

        def cal_loss(pred, labels, batch_size, conf):
            if self.sim_mode == 'represent':
                pos_scores, neg_scores = batch_hard_triplet_scores(
                    labels, pred)  # pos/neg scores
                pos_scores = tf.squeeze(pos_scores, -1)
                neg_scores = tf.squeeze(neg_scores, -1)
                #for represent mode, pred is a batch of vectors (size > 1);
                #we can use triplet loss (hinge loss) or contrastive loss.
                #hinge loss needs no labels; other losses (e.g. contrastive
                #loss) need the pos/neg targets defined below.
                if self.loss_type == 'hinge_loss':
                    #pairwise
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    is_distance=True,
                                    **conf)
                else:
                    #pointwise
                    pos_target = tf.ones(shape=[self.batch_size // 2],
                                         dtype=tf.float32)
                    neg_target = tf.zeros(shape=[self.batch_size // 2],
                                          dtype=tf.float32)

                    pos_loss = get_loss(type=self.loss_type,
                                        logits=pos_scores,
                                        labels=pos_target,
                                        **conf)
                    neg_loss = get_loss(type=self.loss_type,
                                        logits=neg_scores,
                                        labels=neg_target,
                                        **conf)
                    loss = pos_loss + neg_loss

            elif self.sim_mode == 'cross':
                #for cross mode, pred is a batch of scalar scores (size == 1)
                #pdb.set_trace()
                if self.loss_type == 'hinge_loss':
                    #pairwise
                    if self.num_output == 1:
                        pred = tf.nn.sigmoid(pred)
                    elif self.num_output == 2:
                        pred = tf.nn.softmax(pred)[:, 0]
                        pred = tf.expand_dims(pred, -1)
                    else:
                        raise ValueError(
                            'unsupported num_output, 1(sigmoid) or 2(softmax)?'
                        )
                    pos_scores = tf.strided_slice(pred, [0], [batch_size], [2])
                    neg_scores = tf.strided_slice(pred, [1], [batch_size], [2])
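                    # records arrive ordered (query,pos),(query,neg),..., so
                    # even rows of pred score positive pairs and odd rows
                    # score negative pairs; the strided slices split them.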
                    loss = get_loss(type=self.loss_type,
                                    pos_logits=pos_scores,
                                    neg_logits=neg_scores,
                                    is_distance=False,
                                    **conf)
                elif self.loss_type in ['sigmoid_loss']:
                    #pointwise
                    #labels = tf.stack([labels, 1-labels], axis = -1)
                    loss = get_loss(type=self.loss_type,
                                    logits=pred,
                                    labels=labels,
                                    **conf)
                else:
                    raise ValueError('unsupported loss for cross match')
            else:
                raise ValueError('unknown sim mode, cross or represent?')
            return loss

        def model_fn(features, labels, mode, params):
            ############# embedding #################
            if not self.use_language_model:
                self.embedding = init_embedding()
                if self.tfrecords_mode == 'class':
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                else:
                    self.embed_query = self.embedding(features=features,
                                                      name='x_query')
                    self.embed_sample = self.embedding(features=features,
                                                       name='x_sample')
            else:
                self.embedding = None
            #############  encoder  #################
            #model params
            self.encoder.keep_prob = params['keep_prob']
            self.encoder.is_training = params['is_training']
            global_step = tf.train.get_or_create_global_step()
            if self.sim_mode == 'cross':
                if not self.use_language_model:
                    output = self.encoder(x_query=self.embed_query,
                                          x_sample=self.embed_sample,
                                          features=features)
                else:
                    output = self.encoder(features=features)

            elif self.sim_mode == 'represent':
                if not self.use_language_model:
                    #features['x_query_length'] = features['length']
                    output = self.encoder(self.embed_query,
                                          name='x_query',
                                          features=features)
                else:
                    output = self.encoder(features=features)
            else:
                raise ValueError('unknown sim mode')

            ############### predict ##################
            if mode == tf.estimator.ModeKeys.PREDICT:
                #pdb.set_trace()
                predictions = {
                    'encode':
                    output,
                    'pred':
                    tf.cast(tf.greater(tf.nn.softmax(output)[:, 0], 0.5),
                            tf.int32) if self.num_output == 2 else
                    tf.cast(tf.greater(tf.nn.sigmoid(output), 0.5), tf.int32),
                    'score':
                    tf.nn.softmax(output)[:, 0]
                    if self.num_output == 2 else tf.nn.sigmoid(output),
                    'label':
                    features['label']
                }
                return tf.estimator.EstimatorSpec(mode,
                                                  predictions=predictions)
            ############### loss ##################
            loss = cal_loss(output, labels, self.batch_size, self.conf)
            ############### train ##################
            if mode == tf.estimator.ModeKeys.TRAIN:
                if self.use_clr:
                    self.learning_rate = cyclic_learning_rate(
                        global_step=global_step,
                        learning_rate=self.learning_rate,
                        mode=self.clr_mode)
                optimizer = get_train_op(global_step,
                                         self.optimizer_type,
                                         loss,
                                         self.learning_rate,
                                         clip_grad=5)
                return tf.estimator.EstimatorSpec(mode,
                                                  loss=loss,
                                                  train_op=optimizer)
            ############### eval ##################
            if mode == tf.estimator.ModeKeys.EVAL:
                eval_metric_ops = {}
                #{"accuracy": tf.metrics.accuracy(
                #    labels=labels, predictions=predictions["classes"])}
                return tf.estimator.EstimatorSpec(
                    mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

        return model_fn

    def create_input_fn(self, mode):
        n_cpu = multiprocessing.cpu_count()

        def train_input_fn():
            if self.tfrecords_mode == 'pair':
                num_sentences_per_class = 4
                num_classes_per_batch = self.batch_size // num_sentences_per_class
            else:
                #size = self.num_class
                num_classes_per_batch = 16
                num_sentences_per_class = self.batch_size // num_classes_per_batch

            #filenames = ["{}/train_class_{:04d}".format(self.tfrecords_path,i) \
            #                 for i in range(size)]
            filenames = [
                os.path.join(self.tfrecords_path, item)
                for item in os.listdir(self.tfrecords_path)
                if item.startswith('train')
            ]
            size = len(filenames)
            logging.info("tfrecords train class num: {}".format(size))
            datasets = [
                tf.data.TFRecordDataset(filename) for filename in filenames
            ]
            datasets = [dataset.repeat() for dataset in datasets]

            #datasets = [dataset.shuffle(buffer_size=1000) for dataset in datasets]
            def generator():
                while True:
                    labels = np.random.choice(range(size),
                                              num_classes_per_batch,
                                              replace=False)
                    for label in labels:
                        for _ in range(num_sentences_per_class):
                            yield label

            choice_dataset = tf.data.Dataset.from_generator(
                generator, tf.int64)
            dataset = tf.contrib.data.choose_from_datasets(
                datasets, choice_dataset)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(4 * self.batch_size)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            ##test
            #pdb.set_trace()
            #sess = tf.Session()
            #features1,label1 = sess.run([features,label])
            #features1['x_query_pred'] = [item.decode('utf-8') for item in features1['x_query_pred'][1]]
            #features1['x_sample_pred'] = [item.decode('utf-8') for item in features1['x_sample_pred'][1]]
            return features, label

        def test_input_fn(mode):
            filenames = ["{}/{}_class_{:04d}".format(self.tfrecords_path,mode,i) \
                             for i in range(self.num_class * self.test_size)]
            assert self.num_class * self.test_size == len(
                filenames), "unexpected number of tfrecords files!"
            logging.info("tfrecords test class num: {}".format(len(filenames)))
            dataset = tf.data.TFRecordDataset(filenames)
            gt = GenerateTfrecords(self.tfrecords_mode, self.maxlen)
            dataset = dataset.map(
                lambda record: gt.parse_record(record, self.encoder),
                num_parallel_calls=n_cpu)
            dataset = dataset.batch(self.batch_size)
            dataset = dataset.prefetch(1)
            iterator = dataset.make_one_shot_iterator()
            features, label = iterator.get_next()
            return features, label

        if mode == 'train':
            return train_input_fn
        elif mode == 'test':
            return lambda: test_input_fn("test")
        elif mode == 'label':
            return lambda: test_input_fn("train")
        else:
            raise ValueError("unknown input_fn type!")

    def train(self):
        params = {'is_training': True, 'keep_prob': 0.5}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        estimator.train(input_fn=self.create_input_fn("train"),
                        max_steps=self.max_steps)
        self.save()

    def save(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)

        def serving_input_receiver_fn():
            features = {
                'x_query':
                tf.placeholder(dtype=tf.int64,
                               shape=[None, self.maxlen],
                               name='x_query'),
                'x_query_length':
                tf.placeholder(dtype=tf.int64,
                               shape=[None],
                               name='x_query_length'),
                'label':
                tf.placeholder(dtype=tf.int64, shape=[None], name='label')
            }
            if self.tfrecords_mode == 'pair':
                features.update({
                    'x_sample':
                    tf.placeholder(dtype=tf.int64,
                                   shape=[None, self.maxlen],
                                   name='x_sample'),
                    'x_sample_length':
                    tf.placeholder(dtype=tf.int64,
                                   shape=[None],
                                   name='x_sample_length')
                })
            features.update(self.encoder.get_features())
            return tf.estimator.export.ServingInputReceiver(features, features)

        estimator.export_savedmodel(
            self.export_dir_path,  # export directory
            serving_input_receiver_fn,  # function that returns a ServingInputReceiver
            assets_extra=None,
            as_text=False,
            checkpoint_path=None)

    def test(self):
        params = {'is_training': False, 'keep_prob': 1}
        config = tf.estimator.RunConfig(tf_random_seed=230,
                                        model_dir=self.checkpoint_path)
        estimator = tf.estimator.Estimator(model_fn=self.create_model_fn(),
                                           config=config,
                                           params=params)
        predictions = estimator.predict(input_fn=self.create_input_fn("test"))
        predictions = list(predictions)

        if self.tfrecords_mode == 'class':
            predictions_vec = [item['encode'] for item in predictions]
            predictions_label = [item['label'] for item in predictions]
            refers = estimator.predict(input_fn=self.create_input_fn("label"))
            refers = list(refers)

            refers_vec = [item['encode'] for item in refers]
            refers_label = [item['label'] for item in refers]

            right = 0
            thre_right = 0
            sum = 0
            scores = euclidean_distances(predictions_vec, refers_vec)
            selected_ids = np.argmin(scores, axis=-1)
            for idx, item in enumerate(selected_ids):
                if refers_label[item] == predictions_label[idx]:
                    #scores here are euclidean distances (smaller = closer),
                    #so convert to a similarity before applying the threshold
                    if 1 - scores[idx][item] > self.score_thre:
                        thre_right += 1
                    right += 1
                sum += 1
            print("Acc:{}".format(float(right) / sum))
            print("ThreAcc:{}".format(float(thre_right) / sum))
        else:
            #evaluation for 'pair' mode
            scores = [item['score'] for item in predictions]
            labels = [item['label'] for item in predictions]
            #pdb.set_trace()

            #predictions
            scores = np.reshape(scores, [self.num_class * self.test_size, -1])
            pred_max_ids = np.argmax(scores, axis=-1)
            #label
            labels = np.reshape(labels, [self.num_class * self.test_size, -1])

            right = 0
            for idx, max_id in enumerate(pred_max_ids):
                if labels[idx][max_id] == 1:
                    right += 1
            sum = len(pred_max_ids)
            print("Acc:{}".format(float(right) / sum))

    def knn(self, scores, predictions_label, refers_label, k=4):
        sorted_id = np.argsort(-scores, axis=-1)
        shape = np.shape(sorted_id)
        max_id = []
        for idx in range(shape[0]):
            mp = defaultdict(int)
            for idy in range(k):
                mp[refers_label[int(sorted_id[idx][idy])]] += 1
            max_id.append(max(mp, key=mp.get))
        return max_id
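
Both Match variants assemble training batches by interleaving one dataset per
class through tf.contrib.data.choose_from_datasets. A toy standalone sketch of
the sampling scheme, where tensor-slice datasets stand in for the per-class
TFRecord files (all sizes below are illustrative):

import numpy as np
import tensorflow as tf

num_classes, num_classes_per_batch, num_sentences_per_class = 5, 2, 3

# one tiny repeated dataset per class (stands in for a TFRecord file)
datasets = [tf.data.Dataset.from_tensor_slices(
                np.full([10], c, dtype=np.int64)).repeat()
            for c in range(num_classes)]

def generator():
    while True:
        chosen = np.random.choice(num_classes, num_classes_per_batch,
                                  replace=False)
        for c in chosen:
            for _ in range(num_sentences_per_class):
                yield c

choice = tf.data.Dataset.from_generator(generator, tf.int64)
dataset = tf.contrib.data.choose_from_datasets(datasets, choice)
dataset = dataset.batch(num_classes_per_batch * num_sentences_per_class)
batch = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    print(sess.run(batch))   # e.g. [3 3 3 0 0 0]: 2 classes x 3 sentences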
Example n. 9
class Classify(object):
    def __init__(self, conf):
        self.conf = conf
        self.task_type = 'classify'
        for attr in conf:
            setattr(self, attr, conf[attr])

        self.is_training = tf.placeholder(tf.bool, [], name="is_training")
        self.global_step = tf.Variable(0, trainable=False)
        self.keep_prob = tf.where(self.is_training, 0.5, 1.0)

        self.pre = Preprocess()
        self.text_list, self.label_list = load_classify_data(self.train_path)
        self.text_list = [self.pre.get_dl_input_by_text(text) for text in self.text_list]

        if not self.use_language_model:
            #build vocabulary map using training data
            self.vocab_dict = embedding[self.embedding_type].build_dict(dict_path = self.dict_path, 
                                                                  text_list = self.text_list)

            #define embedding object by embedding_type
            self.embedding = embedding[self.embedding_type](text_list = self.text_list,
                                                            vocab_dict = self.vocab_dict,
                                                            dict_path = self.dict_path,
                                                            random=self.rand_embedding,
                                                            batch_size = self.batch_size,
                                                            maxlen = self.maxlen,
                                                            embedding_size = self.embedding_size,
                                                            conf = self.conf)
            self.embed = self.embedding(name = 'x')
        self.y = tf.placeholder(tf.int32, [None], name="y")

        #model params
        params = conf
        params.update({
            "maxlen":self.maxlen,
            "embedding_size":self.embedding_size,
            "keep_prob":self.keep_prob,
            "batch_size": self.batch_size,
            "num_output": self.num_class,
            "is_training": self.is_training
        })
        self.encoder = encoder[self.encoder_type](**params)

        if not self.use_language_model:
            self.out = self.encoder(self.embed)
        else:
            self.out = self.encoder()
        self.output_nodes = self.out.name.split(':')[0]
        self.loss(self.out)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(tf.global_variables())
        if self.use_language_model:
            tvars = tf.trainable_variables()
            init_checkpoint = conf['init_checkpoint_path']
            (assignment_map, initialized_variable_names) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint,assignment_map)

    def load_data(self, mode = 'train'):
        logging.info("Building dataset...")
        if mode == 'train':
            class_mp, class_mp_rev = generate_class_mp(self.label_list, self.classes_path)
            y = [class_mp[item] for item in self.label_list]
            train_x, valid_x, train_y, valid_y = \
                train_test_split(self.text_list, y, test_size=0.05)
            return zip(train_x, train_y), zip(valid_x, valid_y)
        else:

            class_mp, class_mp_rev = load_class_mp(self.classes_path)
            text_list, label_list = load_classify_data(self.test_path)
            y = [class_mp[item] for item in label_list]
            return text_list, y

    def loss(self, out):
        with tf.name_scope("output"):
            self.scores = tf.nn.softmax(out, axis=1, name="scores")
            self.predictions = tf.argmax(self.scores, -1, output_type=tf.int32,
                                         name = 'predictions')
        with tf.name_scope("loss"):
            #self.loss = tf.reduce_mean(
            #    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=out, labels=self.y))
            self.loss = get_loss(type = self.loss_type, logits = out, labels =
                                 self.y, labels_sparse = True, **self.conf)
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)

        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, self.y)
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

    def train(self):
        logging.info("---------start train---------")
        self.train_data, self.valid_data = self.load_data(mode = 'train')
        self.train_data = list(self.train_data)
        self.valid_data = list(self.valid_data)
        train_batches = batch_iter(self.train_data, self.batch_size, self.num_epochs)
        num_batches_per_epoch = (len(self.train_data) - 1) // self.batch_size + 1
        max_accuracy = -1
        for batch in train_batches:
            x_batch, y_batch = zip(*batch)

            train_feed_dict = {
                self.y: y_batch,
                self.is_training: True
            }
            if not self.use_language_model:
                _, x_batch, len_batch = self.embedding.text2id(
                    x_batch, self.vocab_dict, need_preprocess = False)
                train_feed_dict.update(self.embedding.feed_dict(x_batch,'x'))
                train_feed_dict.update(self.encoder.feed_dict(len = len_batch))
            else:
                train_feed_dict.update(self.encoder.feed_dict(x_batch))
            _, step, loss = self.sess.run([self.optimizer, self.global_step, self.loss], feed_dict=train_feed_dict)
            if step % (self.valid_step // 10) == 0:
                logging.info("step {0}: loss = {1}".format(step, loss))
            if step % self.valid_step == 0:
                # Test accuracy with validation data for each epoch.
                valid_batches = batch_iter(self.valid_data, self.batch_size, 1, shuffle=False)
                sum_accuracy, cnt = 0, 0
                for valid_batch in valid_batches:

                    valid_x_batch, valid_y_batch = zip(*valid_batch)

                    valid_feed_dict = {
                        self.y: valid_y_batch,
                        self.is_training: False
                    }
                    if not self.use_language_model:
                        _, valid_x_batch, len_batch = self.embedding.text2id(
                            valid_x_batch, self.vocab_dict, need_preprocess=False)
                        valid_feed_dict.update(
                            self.embedding.feed_dict(valid_x_batch, 'x'))
                        valid_feed_dict.update(self.encoder.feed_dict(len=len_batch))
                    else:
                        valid_feed_dict.update(self.encoder.feed_dict(valid_x_batch))
                    accuracy = self.sess.run(self.accuracy, feed_dict=valid_feed_dict)
                    sum_accuracy += accuracy
                    cnt += 1
                valid_accuracy = sum_accuracy / cnt
                logging.info("\nEpoch {0}: validation accuracy = {1}\n".format(
                    step // num_batches_per_epoch, valid_accuracy))
                # Save model
                if valid_accuracy > max_accuracy:
                    max_accuracy = valid_accuracy
                    self.saver.save(self.sess,
                                    "{0}/{1}.ckpt".format(self.checkpoint_path,
                                                              self.task_type),
                                    global_step=step)
                    logging.info("Model is saved.\n")
                else:
                    # First drop in validation accuracy: stop early, freeze
                    # the best checkpoint to a .pb, and exit.
                    self.save_pb()
                    logging.info("train finished! best accuracy: {0}".format(
                        max_accuracy))
                    sys.exit(0)
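
    # Editor's sketch: batch_iter is assumed to be the usual epoch-wise
    # shuffling batch generator, matching the calls above (assumption,
    # not the repo's code):
    #
    #     import random
    #     def batch_iter(data, batch_size, num_epochs, shuffle=True):
    #         data = list(data)
    #         num_batches = (len(data) - 1) // batch_size + 1
    #         for _ in range(num_epochs):
    #             if shuffle:
    #                 random.shuffle(data)
    #             for i in range(num_batches):
    #                 yield data[i * batch_size:(i + 1) * batch_size]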

    def save_pb(self):
        write_pb(self.checkpoint_path,
                 self.model_path,
                 ['is_training', 'output/predictions',
                  'accuracy/accuracy', self.output_nodes])
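
    # Editor's sketch: write_pb/load_pb are assumed to freeze the latest
    # checkpoint into a GraphDef and load it back, along the usual TF1
    # lines (illustration only):
    #
    #     def write_pb(checkpoint_dir, pb_path, output_nodes):
    #         with tf.Session() as sess:
    #             ckpt = tf.train.latest_checkpoint(checkpoint_dir)
    #             saver = tf.train.import_meta_graph(ckpt + '.meta')
    #             saver.restore(sess, ckpt)
    #             frozen = tf.graph_util.convert_variables_to_constants(
    #                 sess, sess.graph.as_graph_def(), output_nodes)
    #             with tf.gfile.GFile(pb_path, 'wb') as f:
    #                 f.write(frozen.SerializeToString())
    #
    #     def load_pb(pb_path):
    #         graph_def = tf.GraphDef()
    #         with tf.gfile.GFile(pb_path, 'rb') as f:
    #             graph_def.ParseFromString(f.read())
    #         graph = tf.Graph()
    #         with graph.as_default():
    #             tf.import_graph_def(graph_def, name='')
    #         return graph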

    def test(self):
        if not os.path.exists(self.model_path):
            self.save_pb()
        graph = load_pb(self.model_path)
        sess = tf.Session(graph=graph)

        self.y = graph.get_operation_by_name("y").outputs[0]
        self.is_training = graph.get_operation_by_name("is_training").outputs[0]
        self.accuracy = graph.get_operation_by_name("accuracy/accuracy").outputs[0]

        self.scores = graph.get_tensor_by_name("output/scores:0")
        #self.scores = graph.get_tensor_by_name(self.output_nodes+":0")
        self.predictions = graph.get_tensor_by_name("output/predictions:0")

        mp, mp_rev = load_class_mp(self.classes_path)

        test_x, test_y = self.load_data("test")
        pred_y = []
        scores = []
        # zip() is lazy in Python 3; materialize it so batch_iter can
        # take len() of the data.
        batches = batch_iter(list(zip(test_x, test_y)), self.batch_size, 1,
                             shuffle=False)
        sum_accuracy, cnt = 0, 0
        right, total = 0, 0  # "total" avoids shadowing the builtin all()
        vocab_dict = embedding[self.embedding_type].build_dict(
            self.dict_path, mode='test')
        all_test_x = []
        all_test_y = []
        for batch in batches:
            batch_x, batch_y = zip(*batch)

            feed_dict = {
                self.y: batch_y,
                self.is_training: False
            }
            if not self.use_language_model:
                preprocess_x, batch_x_id, len_batch = self.embedding.text2id(
                    batch_x, vocab_dict, need_preprocess=True)
                feed_dict.update(self.embedding.pb_feed_dict(graph, batch_x_id, 'x'))
                feed_dict.update(self.encoder.pb_feed_dict(graph, len=len_batch))
            else:
                feed_dict.update(self.encoder.pb_feed_dict(graph, batch_x))
            accuracy_out, predictions_out, scores_out = sess.run([self.accuracy,
                                                                  self.predictions,
                                                                  self.scores],
                                                                 feed_dict=feed_dict)
            max_scores = [scores_out[idx][predictions_out[idx]] \
                          for idx in range(len(predictions_out))]
            sum_accuracy += accuracy_out
            cnt += 1
            pred_y += list(predictions_out)
            scores += list(max_scores)
            all_test_x += list(batch_x)
            all_test_y += list(batch_y)

            for idx in range(len(predictions_out)):
                if predictions_out[idx] == int(batch_y[idx]) \
                        and max_scores[idx] > self.thre_score:
                    right += 1
                total += 1
        dt = pd.DataFrame({'text': all_test_x,
                           'target': [mp_rev[int(item)] for item in all_test_y],
                           'pred': [mp_rev[item] for item in pred_y],
                           'score': scores})
        dt.to_csv(self.test_path + '.result.csv', index=False, sep=',')
        logging.info("Test Accuracy : {0}".format(sum_accuracy / cnt))
        logging.info("Test Thre Accuracy : {0}".format(right / total))

    def predict(self):
        predict_file = self.predict_path
        if not os.path.exists(self.model_path):
            self.save_pb()
        graph = load_pb(self.model_path)
        sess = tf.Session(graph=graph)

        self.y = graph.get_operation_by_name("y").outputs[0]
        self.is_training = graph.get_operation_by_name("is_training").outputs[0]

        #self.scores = graph.get_tensor_by_name(self.output_nodes+":0")
        self.scores = graph.get_tensor_by_name("output/scores:0")
        self.predictions = graph.get_tensor_by_name("output/predictions:0")

        vocab_dict = embedding[self.embedding_type].build_dict(
            self.dict_path, mode='test')
        mp, mp_rev = load_class_mp(self.classes_path)
        with open(predict_file) as f:
            lines = [line.strip() for line in f.readlines()]
            batches = batch_iter(lines, self.batch_size, 1, shuffle=False)
            scores = []
            predicts = []
            for batch_x in batches:
                feed_dict = {
                    self.is_training: False
                }
                if not self.use_language_model:
                    preprocess_x, batch_x, len_batch = self.embedding.text2id(
                        batch_x, vocab_dict)
                    feed_dict.update(self.embedding.pb_feed_dict(graph, batch_x, 'x'))
                    feed_dict.update(self.encoder.pb_feed_dict(graph, len=len_batch))
                else:
                    feed_dict.update(self.encoder.pb_feed_dict(graph, batch_x))
                predictions_out, scores_out = sess.run(
                    [self.predictions, self.scores], feed_dict=feed_dict)
                max_scores = [scores_out[idx][predictions_out[idx]] \
                              for idx in range(len(predictions_out))]

                predicts += list(predictions_out)
                scores += list(max_scores)

            predicts = [mp_rev[item] for item in predicts]

            dt = pd.DataFrame({'text': lines,
                               'pred': predicts,
                               'score': scores})
            dt.to_csv(self.predict_path + '.result.csv', index=False, sep=',')
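
    # predict() expects predict_path to be a plain-text file with one raw
    # query per line; the results land alongside it, e.g.:
    #
    #     with open(self.predict_path, 'w') as f:
    #         f.write('first query\nsecond query\n')
    #     self.predict()   # -> <predict_path>.result.csv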

    def test_unit(self, text):
        if not os.path.exists(self.model_path):
            self.save_pb()
        graph = load_pb(self.model_path)
        sess = tf.Session(graph=graph)

        self.y = graph.get_operation_by_name("y").outputs[0]
        self.is_training = graph.get_operation_by_name("is_training").outputs[0]

        self.scores = graph.get_tensor_by_name("output/scores:0")
        #self.scores = graph.get_tensor_by_name(self.output_nodes+":0")
        self.predictions = graph.get_tensor_by_name("output/predictions:0")

        vocab_dict = embedding[self.embedding_type].build_dict(
            self.dict_path, mode='test')
        mp, mp_rev = load_class_mp(self.classes_path)
        batches = batch_iter([text], self.batch_size, 1, shuffle=False)
        for batch_x in batches:
            feed_dict = {
                self.is_training: False
            }
            if not self.use_language_model:
                preprocess_x, batch_x, len_batch = self.embedding.text2id(
                    batch_x, vocab_dict)
                feed_dict.update(self.embedding.pb_feed_dict(graph, batch_x, 'x'))
                feed_dict.update(self.encoder.pb_feed_dict(graph, len=len_batch))
            else:
                feed_dict.update(self.encoder.pb_feed_dict(graph, batch_x))
            predictions_out, scores_out = sess.run(
                [self.predictions, self.scores], feed_dict=feed_dict)
            max_scores = [scores_out[idx][predictions_out[idx]]
                          for idx in range(len(predictions_out))]
        if not self.use_language_model:
            # preprocess_x is only set on the embedding path above
            logging.info("preprocess: {}".format(preprocess_x))
        logging.info("class:{}, score:{}, class_id:{}".format(
            mp_rev[predictions_out[0]],
            max_scores[0],
            predictions_out[0]))
        return mp_rev[predictions_out[0]], max_scores[0]
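
    # Minimal usage sketch (assumes a conf dict with the keys read by the
    # constructor -- paths, batch_size, encoder type, etc. -- whose full
    # schema is not shown in this listing):
    #
    #     task = Classify(conf)
    #     task.train()      # early-stops and freezes a .pb via save_pb()
    #     task.test()       # writes <test_path>.result.csv
    #     label, score = task.test_unit("some input text")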