Code Example #1
    def prepare_test_data(self):
        # Load the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level vocabulary
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC':
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header row, so skip the first line when reading
        dataset = TextLineDataset(os.path.join('data', preprocess.TEST_PATH)).skip(1)
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(self.test_batch_size)

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        return dataset, next_element
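
A minimal consumption sketch for the one-shot iterator returned above, assuming a TF 1.x session and an instance `model` of the surrounding class; the loop, `convert_test_input`, and the prediction step are illustrative assumptions, not part of the original:

import tensorflow as tf

dataset, next_element = model.prepare_test_data()
with tf.Session() as sess:
    while True:
        try:
            lines = sess.run(next_element)              # one batch of raw test lines (bytes)
            batch_x = model.convert_test_input(lines)   # assumed helper: titles -> id sequences
            # run predictions here, e.g. sess.run(prediction, feed_dict={input_x: batch_x, ...})
        except tf.errors.OutOfRangeError:
            break                                       # the one-shot iterator is exhausted after one pass
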
Code Example #2
    def test_pipeline(self, num_threads):
        real_fname = os.path.join(self.dataset_path, 'test_real.txt')

        # extract directories
        real_dir, inst_dir = self.real_dir, self.inst_dir

        # count lines
        num_real = count_lines(real_fname)

        # dataset creation
        with tf.name_scope('dataset'):
            real = TextLineDataset(real_fname)

            # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
            #synt.apply(shuffle_and_repeat(buffer_size = num_synt)) #, count = 1))
            #real.apply(shuffle_and_repeat(buffer_size = num_real)) #, count = ceil(ratio)))

            real = real.shuffle(num_real) # no repetition! .repeat()

            # real data only
            augment = 0 # self.params.get('augment', 0)
            def name2real(name):
                inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
                if augment:
                    src_dir = self.params.get('augment_src', 'best')
                    # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                    full = read_image(os.path.join(real_dir, str(src_dir), 'rgb', name.decode() + '.jpg'), False)
                    pnts = read_points(os.path.join(real_dir, str(src_dir), 'points', name.decode() + '.txt'))
                    if isinstance(src_dir, float):
                        pnts *= src_dir
                    self.params['augment_scale'] = 0.
                    real = random_crop(full, pnts, self.params)
                else:
                    real = read_image(os.path.join(real_dir, '160x160', 'gray', name.decode() + '.jpg'))
                return real, inst, name.decode()
            real = real.map(lambda name: tuple(tf.py_func(name2real, [name], [tf.float32, tf.int32, tf.string])), num_parallel_calls = num_threads)

            #dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
            dataset = Dataset.zip({ 'real': real })
            dataset = dataset.batch(self.batch_size, drop_remainder = True) # we need full batches!
            dataset = dataset.prefetch(self.batch_size * 2)
            return dataset
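
A minimal sketch of how the dict-structured dataset returned by test_pipeline might be consumed in TF 1.x; `pipeline` (an instance of the surrounding class) and the inspection step are assumptions, not part of the original:

import tensorflow as tf

dataset = pipeline.test_pipeline(num_threads=4)
iterator = dataset.make_one_shot_iterator()
batch = iterator.get_next()              # a dict whose 'real' entry is the (image, inst, name) tuple

with tf.Session() as sess:
    real_img, real_inst, real_name = sess.run(batch['real'])
    print(real_img.shape, real_name[:3])  # e.g. inspect the batch shape and the first few file names
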
Code Example #3
class BiLSTM(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.unit_num = config.unit_num
        self.vocab_size = config.vocab_size

        self.dense_unit_num = config.dense_unit_num
        self.train_batch_size = config.train_batch_size
        self.valid_batch_size = config.valid_batch_size
        self.test_batch_size = config.test_batch_size

        if config.train_mode == 'CHAR-RANDOM':
            # Maximum text length (in characters)
            self.text_length = preprocess.MAX_CHAR_TEXT_LENGTH
            # Word-embedding dimension
            self.embedding_dim = config.embedding_dim

        elif config.train_mode == 'WORD-NON-STATIC':
            self.text_length = preprocess.MAX_WORD_TEXT_LENGTH
            self.embedding_dim = preprocess.vec_dim

        self.train_mode = config.train_mode

        self.input_x = None
        self.input_y = None
        self.labels = None
        self.dropout_keep_prob = None
        self.training = None
        self.embedding_inputs = None
        self.embedding_inputs_expanded = None
        self.loss = None
        self.accuracy = None
        self.prediction = None
        self.vocab = None
        self.vecs_dict = {}
        self.embedding_W = None

    def setBiLSTM(self):
        # Input layer
        self.input_x = tf.placeholder(tf.int32, [None, self.text_length], name="input_x")

        self.labels = tf.placeholder(tf.int32, [None], name="input_y")
        # Convert integer labels to one-hot form
        self.input_y = tf.one_hot(self.labels, self.class_num)
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # The `training` argument of batch_normalization should be True during training
        # and False during validation or testing
        self.training = tf.placeholder(tf.bool, name='training')

        # Word-embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if self.train_mode == 'CHAR-RANDOM':
                # Randomly initialized embeddings
                W = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_dim], -1.0, 1.0))
            elif self.train_mode == 'WORD-NON-STATIC':
                # Use the pre-trained word vectors loaded earlier
                W = tf.Variable(self.embedding_W)
            self.embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)

        with tf.name_scope("batch_norm"):
            self.embedding_inputs = tf.layers.batch_normalization(self.embedding_inputs, training=self.training)

        def basic_lstm_cell():
            bcell = tf.nn.rnn_cell.LSTMCell(self.unit_num)
            return tf.nn.rnn_cell.DropoutWrapper(bcell, output_keep_prob=self.dropout_keep_prob)

        with tf.name_scope("RNN"):
            # Bidirectional LSTM; each direction has unit_num units
            # ======================================================================================
            # Bidirection LSTM
            lstm_fw_cell = basic_lstm_cell()  # forward direction cell
            lstm_bw_cell = basic_lstm_cell()  # backward direction cell
            # [batch_size, sequence_length, hidden_size] #creates a dynamic bidirectional recurrent neural network
            output, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, self.embedding_inputs,
                                                         dtype=tf.float32)
            # concat output
            output = tf.concat(output, axis=2)  # [batch_size,sequence_length,hidden_size*2]

            rnn_output = output[:, -1, :]  # take the last time step as the output
            # =========================================================================================

        with tf.name_scope("dense"):
            # Fully connected layer
            # ======================================================================================
            h_full = tf.layers.dense(inputs=rnn_output,
                                     units=self.dense_unit_num,
                                     use_bias=True,
                                     kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                     bias_initializer=tf.constant_initializer(0.1)
                                     )
            # tf.layers.dropout takes the drop rate (not the keep probability) and
            # only applies dropout when the training flag is set
            h_full = tf.layers.dropout(h_full, rate=1.0 - self.dropout_keep_prob, training=self.training)
            h_full = tf.nn.relu(h_full)
            # ==========================================================================================


        # Output layer
        with tf.name_scope('output'):
            score = tf.layers.dense(
                h_full,
                units=self.class_num,
                activation=None,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1)
            )
            self.score = tf.multiply(score, 1, name='score')
            self.prediction = tf.argmax(score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """
        Convert training-set lines into id (or word-vector) representations
        """
        batch_x = []
        batch_y = []
        title = ""
        # Convert each title to an id sequence
        for line in lines:
            line_ = line.decode("gbk").strip().split(',')
            title = ''.join(line_[0:-1])    # everything before the last comma is the title
            label = ''.join(line_[-1])      # the last field is the label
            batch_x.append(preprocess.to_id(title, self.vocab, self.train_mode))
            batch_y.append(label)

        batch_x = np.stack(batch_x)
        return batch_x, batch_y

    def convert_test_input(self, titles):
        """
        Convert test-set tsv lines into id (or word-vector) representations
        :param titles:
        :return:
        """
        batch_x = []
        # Convert each title to an id sequence
        for title in titles:
            valid_title = title.decode('gb18030').strip('\t')
            batch_x.append(preprocess.to_id(valid_title, self.vocab, self.train_mode))

        batch_x = np.stack(batch_x)
        return batch_x

    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level: load the vocabulary
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into a variable
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)
        # Split the dataset:
        # the first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(self.valid_batch_size)
        # the rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To fetch elements, first run sess.run(train_init_op) to initialize the iterator,
        # then run sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()

        return train_init_op, valid_init_op, next_train_element, next_valid_element
        # =============================================================
        # Data preparation ends.

    def prepare_test_data(self):
        # Load the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level vocabulary
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC':
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header row, so skip the first line when reading
        dataset = TextLineDataset(os.path.join('data', preprocess.TEST_PATH)).skip(1)
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(self.test_batch_size)

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        return dataset, next_element
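
A sketch of a training loop that uses the init ops and batch tensors returned by prepare_data, following the comments above (run the init op first, then fetch batches); `config`, the optimizer, and the feed values are assumptions, not part of the original:

import numpy as np
import tensorflow as tf

model = BiLSTM(config)
train_init_op, valid_init_op, next_train, next_valid = model.prepare_data()
model.setBiLSTM()

# batch_normalization needs its update ops attached to the train step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)   # assumed optimizer

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_init_op)                              # initialize the training iterator first
    while True:
        try:
            lines = sess.run(next_train)                 # fetch one raw batch of lines
            batch_x, batch_y = model.convert_input(lines)
            _, loss = sess.run([train_op, model.loss], feed_dict={
                model.input_x: batch_x,
                model.labels: np.asarray(batch_y, dtype=np.int32),
                model.dropout_keep_prob: 0.5,            # keep probability while training
                model.training: True})                   # batch norm in training mode
        except tf.errors.OutOfRangeError:
            break                                        # one pass over the training split
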
Code Example #4
class CNN(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.img_size = config.img_size
        self.crop_size = config.crop_size
        self.train_batch_size = config.train_batch_size
        self.test_batch_size = config.test_batch_size
        self.test_per_batch = config.test_per_batch

        self.batch_x = ''
        self.batch_y = ''
        self.input_x = ''
        self.labels = ''
        self.input_y = ''
        self.dropout_keep_prob = ''
        self.training = ''
        self.embedding_inputs = ''
        self.embedding_inputs_expanded = ''
        self.loss = ''
        self.accuracy = ''
        self.prediction = ''
        self.vocab = ''

    def _set_input(self):
        # Input layer
        self.input_x = tf.placeholder(tf.float32, [None, self.img_size, self.img_size, 1], name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="labels")
        self.input_y = tf.one_hot(self.labels, self.class_num, name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

        # The `training` argument of batch_normalization should be True during training
        # and False during validation or testing
        self.training = tf.placeholder(tf.bool, name='training')

        self.input_x_enhanced = tf.map_fn(self.image_enhance, self.input_x)
     
    def _conv(self, input, ksize, stride, filters):
        return tf.layers.conv2d(
            inputs=input,
            filters=filters,
            kernel_size=[ksize, ksize],
            strides=[stride, stride],
            padding='SAME',
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.initializers.constant(0.0),
        )

    def _maxpool_2x2(self, input):
        return tf.layers.max_pooling2d(
            inputs=input,
            pool_size=[2, 2],
            strides=[2, 2],
            padding='SAME',
        )

    def _fc(self, input, units, dropout_keep_prob, name=None):
        fc_output = tf.layers.dense(
            inputs=input,
            units=units,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.initializers.constant(0.0),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001),
            bias_regularizer=tf.contrib.layers.l2_regularizer(0.001),
            name=name
            )
        # rate is the fraction to drop, so convert from the keep probability;
        # dropout is only active when self.training is True
        return tf.layers.dropout(fc_output, rate=1.0 - dropout_keep_prob, training=self.training)

    def setVGG19(self):
        """
        Build the VGG19 model in this function
        :return:
        """
        self._set_input()
        # Add batch normalization to reduce overfitting
        input_x_norm = tf.layers.batch_normalization(self.input_x_enhanced, training=self.training)

        # conv3-64
        conv3_64_1 = self._conv(input_x_norm, 3, 1, 64)
        conv3_64_output = self._conv(conv3_64_1, 3, 1, 64)

        # maxpool-1
        maxpool_1_output = self._maxpool_2x2(conv3_64_output)

        # conv3-128
        conv3_128_1 = self._conv(maxpool_1_output, 3, 1, 128)
        conv3_128_output = self._conv(conv3_128_1, 3, 1, 128)

        # maxpool-2
        maxpool_2_output = self._maxpool_2x2(conv3_128_output)

        # conv3-256
        conv3_256_1 = self._conv(maxpool_2_output, 3, 1, 256)
        conv3_256_2 = self._conv(conv3_256_1, 3, 1, 256)
        conv3_256_3 = self._conv(conv3_256_2, 3, 1, 256)
        conv3_256_output = self._conv(conv3_256_3, 3, 1, 256)

        # maxpool-3
        maxpool_3_output = self._maxpool_2x2(conv3_256_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_3_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_3 = self._conv(conv3_512_2, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_3, 3, 1, 512)

        # maxpool-4
        maxpool_4_output = self._maxpool_2x2(conv3_512_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_4_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_3 = self._conv(conv3_512_2, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_3, 3, 1, 512)

        # maxpool-5
        maxpool_5_output = self._maxpool_2x2(conv3_512_output)

        # flatten
        shape = maxpool_5_output.shape.as_list()
        dims = shape[1]*shape[2]*shape[3]
        maxpool_5_output_flatten = tf.reshape(maxpool_5_output, [-1, dims])

        # fully-connected-1
        fc_1 = self._fc(maxpool_5_output_flatten, 2048, self.dropout_keep_prob)

        # fully-connected-2
        fc_2 = self._fc(fc_1, 2048, self.dropout_keep_prob)

        # fully-connected-3
        fc_3 = self._fc(fc_2, 1000, self.dropout_keep_prob)

        # Output layer
        self.score = self._fc(fc_3, self.class_num, self.dropout_keep_prob, name='score')

        self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + tf.losses.get_regularization_loss()

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """
        Convert the raw text lines into an array of shape
        [batch_size, img_size, img_size, 1] to be fed into the CNN

        :param lines: a batch of raw CSV lines (label, pixel values)
        :return:
        """
        batch_x = []
        batch_y = []
        for line in lines:
            line_ = line.decode('utf-8').strip().split(',')
            pixels = line_[1].split()  # pixel values
            label = line_[0]  # the first field is the label
            batch_x.append([float(x) for x in pixels])
            batch_y.append(int(label))
        batch_x = np.stack(batch_x)
        batch_x = batch_x.reshape([-1, self.img_size, self.img_size, 1])
        batch_y = np.asarray(batch_y)

        return batch_x, batch_y

    def prepare_data(self):
        self.train_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TRAIN_PATH)).skip(1)
        self.test_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TEST_PATH)).skip(1)

        print('Shuffling dataset...')
        # Shuffle the data
        train_dataset = self.train_dataset.shuffle(5000).batch(self.train_batch_size)
        test_dataset = self.test_dataset.shuffle(2500).batch(self.test_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        test_iterator = test_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        test_init_op = test_iterator.initializer

        # To fetch elements, first run sess.run(train_init_op) to initialize the iterator,
        # then run sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_test_element = test_iterator.get_next()

        return train_init_op, test_init_op, next_train_element, next_test_element
        # ==============================================================

    def image_enhance(self, image):
        """
        Apply data augmentation to an image
        :param image: a numpy array or tensor of shape [height, width, 1]
        :return:
        """
        # Random crop
        images_crop = tf.image.random_crop(image, [self.crop_size, self.crop_size, 1])

        # Random horizontal flip
        images_crop = tf.image.random_flip_left_right(images_crop)
        # Random contrast
        images_crop = tf.image.random_contrast(images_crop, 0.5, 1.5)
        # Random brightness
        images_crop = tf.image.random_brightness(images_crop, max_delta=0.5)

        noise = tf.random_normal(shape=tf.shape(images_crop), mean=0.0, stddev=10.0,
                                 dtype=tf.float32)
        images_crop = tf.add(images_crop, noise)

        return images_crop

    def setVGG16(self):
        """
        Build the VGG16 model in this function
        :return:
        """
        self._set_input()
        # Add batch normalization to reduce overfitting
        input_x_norm = tf.layers.batch_normalization(self.input_x_enhanced, training=self.training)

        # conv3-64
        conv3_64_1 = self._conv(input_x_norm, 3, 1, 64)
        conv3_64_output = self._conv(conv3_64_1, 3, 1, 64)

        # maxpool-1
        maxpool_1_output = self._maxpool_2x2(conv3_64_output)

        # conv3-128
        conv3_128_1 = self._conv(maxpool_1_output, 3, 1, 128)
        conv3_128_output = self._conv(conv3_128_1, 3, 1, 128)

        # maxpool-2
        maxpool_2_output = self._maxpool_2x2(conv3_128_output)

        # conv3-256
        conv3_256_1 = self._conv(maxpool_2_output, 3, 1, 256)
        conv3_256_2 = self._conv(conv3_256_1, 3, 1, 256)
        conv3_256_output = self._conv(conv3_256_2, 1, 1, 256)

        # maxpool-3
        maxpool_3_output = self._maxpool_2x2(conv3_256_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_3_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_2, 1, 1, 512)

        # maxpool-4
        maxpool_4_output = self._maxpool_2x2(conv3_512_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_4_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_2, 1, 1, 512)

        # maxpool-5
        maxpool_5_output = self._maxpool_2x2(conv3_512_output)

        # flatten
        shape = maxpool_5_output.shape.as_list()
        dims = shape[1]*shape[2]*shape[3]
        maxpool_5_output_flatten = tf.reshape(maxpool_5_output, [-1, dims])

        # fully-connected-1
        fc_1 = self._fc(maxpool_5_output_flatten, 2048, self.dropout_keep_prob)

        # fully-connected-2
        fc_2 = self._fc(fc_1, 2048, self.dropout_keep_prob)

        # fully-connected-3
        fc_3 = self._fc(fc_2, 1000, self.dropout_keep_prob)

        # Output layer
        self.score = self._fc(fc_3, self.class_num, self.dropout_keep_prob, name='score')

        self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + tf.losses.get_regularization_loss()

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
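
An evaluation sketch for the CNN above, with the batch-norm training flag set to False as the comments require; `config`, the feed values, and restoring weights are assumptions, not part of the original:

import tensorflow as tf

model = CNN(config)
train_init_op, test_init_op, next_train, next_test = model.prepare_data()
model.setVGG16()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())   # in practice, restore trained weights instead
    sess.run(test_init_op)                        # initialize the test iterator
    accuracies = []
    while True:
        try:
            lines = sess.run(next_test)
            batch_x, batch_y = model.convert_input(lines)
            acc = sess.run(model.accuracy, feed_dict={
                model.input_x: batch_x,
                model.labels: batch_y,
                model.dropout_keep_prob: 1.0,     # keep everything at evaluation time
                model.training: False})           # batch norm / dropout in inference mode
            accuracies.append(acc)
        except tf.errors.OutOfRangeError:
            break
    if accuracies:
        print('test accuracy: %.4f' % (sum(accuracies) / len(accuracies)))
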
Code Example #5
class TextCNN(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.filter_sizes = config.filter_sizes
        self.filter_num = config.filter_num
        self.vocab_size = config.vocab_size

        self.dense_unit_num = config.dense_unit_num
        self.train_batch_size = config.train_batch_size
        self.valid_batch_size = config.valid_batch_size
        self.test_batch_size = config.test_batch_size

        if config.train_mode == 'CHAR-RANDOM':
            # Maximum text length (in characters)
            self.text_length = preprocess.MAX_CHAR_TEXT_LENGTH
            # Word-embedding dimension
            self.embedding_dim = config.embedding_dim

        elif config.train_mode == 'WORD-NON-STATIC' or config.train_mode == 'MULTI':
            self.text_length = preprocess.MAX_WORD_TEXT_LENGTH
            self.embedding_dim = preprocess.vec_dim

        self.train_mode = config.train_mode

        self.input_x = None
        self.input_y = None
        self.labels = None
        self.dropout_keep_prob = None
        self.training = None
        self.embedding_inputs_expanded = None
        self.loss = None
        self.accuracy = None
        self.prediction = None
        self.vocab = None
        self.vecs_dict = {}
        self.embedding_W = None
        self.dataset = None

    def setCNN(self):
        # Input layer
        self.input_x = tf.placeholder(tf.int32, [None, self.text_length],
                                      name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="input_y")
        # Convert integer labels to one-hot form
        self.input_y = tf.one_hot(self.labels, self.class_num)
        self.dropout_keep_prob = tf.placeholder(tf.float32,
                                                name="dropout_keep_prob")

        # The `training` argument of batch_normalization should be True during training
        # and False during validation or testing
        self.training = tf.placeholder(tf.bool, name='training')

        # Word-embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if self.train_mode == 'CHAR-RANDOM':
                # Randomly initialized embeddings
                W = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embedding_dim],
                                      -1.0, 1.0))
                embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)
                self.embedding_inputs_expanded = tf.expand_dims(
                    embedding_inputs, -1)
            elif self.train_mode == 'WORD-NON-STATIC':
                # Use the pre-trained word vectors loaded earlier
                W = tf.Variable(self.embedding_W)
                embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)
                self.embedding_inputs_expanded = tf.expand_dims(
                    embedding_inputs, -1)
            elif self.train_mode == 'MULTI':
                W1 = tf.Variable(self.embedding_W)
                W2 = tf.Variable(self.embedding_W, trainable=False)
                embedding_inputs1 = tf.nn.embedding_lookup(W1, self.input_x)
                embedding_inputs2 = tf.nn.embedding_lookup(W2, self.input_x)
                self.embedding_inputs_expanded = tf.stack(
                    [embedding_inputs1, embedding_inputs2], axis=-1)

        # The final pooling output, containing outputs from each filter
        pool_outputs = []
        # Iterate to create convolution layer for each filter
        for filter_size in self.filter_sizes:
            with tf.name_scope("conv-maxpool-%d" % filter_size):
                # Convolution layer 1
                # ==================================================================
                filter_shape = [filter_size, self.embedding_dim]

                conv_1 = tf.layers.conv2d(
                    inputs=self.embedding_inputs_expanded,
                    filters=self.filter_num,
                    kernel_size=filter_shape,
                    strides=[1, 1],
                    padding='VALID',
                    use_bias=True,
                    kernel_initializer=tf.initializers.truncated_normal(
                        stddev=0.1),
                    bias_initializer=tf.initializers.constant(0.1))
                # ===================================================================
                # Do batch normalization
                # =================================================================
                conv_1_output = tf.layers.batch_normalization(
                    conv_1, training=self.training)
                conv_1_output = tf.nn.relu(conv_1_output)
                # ======================================================================
                # Pooling layer 1
                # ====================================================================
                conv_1_output_shape = conv_1_output.shape.as_list()
                pool_1 = tf.layers.max_pooling2d(
                    inputs=conv_1_output,
                    pool_size=[conv_1_output_shape[1], 1],  # pool over the full feature-map height
                    strides=[1, 1],
                    padding='VALID')
                # =====================================================================

            pool_outputs.append(pool_1)

        # Combine all the pooling output
        # The total number of filters.
        total_filter_num = self.filter_num * len(self.filter_sizes)
        h_pool = tf.concat(pool_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, total_filter_num])
        # Output shape[batch, total_filter_num]

        # Full-connected layer
        # ========================================================================
        with tf.name_scope('dense-%d' % self.dense_unit_num):
            h_full = tf.layers.dense(
                h_pool_flat,
                units=self.dense_unit_num,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            # tf.layers.dropout takes the drop rate (not the keep probability) and
            # only applies dropout when the training flag is set
            h_full = tf.layers.dropout(h_full, rate=1.0 - self.dropout_keep_prob, training=self.training)
            h_full = tf.nn.relu(h_full)
        # =========================================================================

        # Output layer
        with tf.name_scope('output'):
            score = tf.layers.dense(
                h_full,
                units=self.class_num,
                activation=None,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            self.score = tf.multiply(score, 1, name='score')
            self.prediction = tf.argmax(score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction,
                                           tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """
        Convert training-set lines into id (or word-vector) representations
        """
        batch_x = []
        batch_y = []
        title = ""

        for line in lines:
            line_ = line.decode("gbk").strip().split(',')
            title = ''.join(line_[0:-1])  # everything before the last comma is the title
            label = ''.join(line_[-1])  # the last field is the label
            batch_x.append(preprocess.to_id(title, self.vocab,
                                            self.train_mode))
            batch_y.append(label)

        batch_x = np.stack(batch_x)
        return batch_x, batch_y

    def convert_test_input(self, titles):
        """
        Convert test-set tsv lines into id (or word-vector) representations
        :param titles:
        :return:
        """
        batch_x = []
        # Convert each title to an id sequence
        for title in titles:
            valid_title = title.decode('gb18030').strip('\t')
            batch_x.append(
                preprocess.to_id(valid_title, self.vocab, self.train_mode))

        batch_x = np.stack(batch_x)
        return batch_x

    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level: load the vocabulary
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into a variable
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(
                os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(
                shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(
            os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)
        # Split the dataset:
        # the first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(
            self.valid_batch_size)
        # the rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(
            self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()

        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To fetch elements, first run sess.run(train_init_op) to initialize the iterator,
        # then run sess.run(next_train_element)
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()

        return train_init_op, valid_init_op, next_train_element, next_valid_element
        # =============================================================
        # Data preparation ends.

    def prepare_test_data(self):
        # Load the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. Character-level vocabulary
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))

        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header row, so skip the first line when reading
        dataset = TextLineDataset(os.path.join('data', preprocess.TEST_PATH)).skip(1)
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(
            self.test_batch_size)

        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        return dataset, next_element