def test_pipeline(self, num_threads):
    real_fname = os.path.join(self.dataset_path, 'test_real.txt')

    # extract directories
    real_dir, inst_dir = self.real_dir, self.inst_dir

    # count lines
    num_real = count_lines(real_fname)

    # dataset creation
    with tf.name_scope('dataset'):
        real = TextLineDataset(real_fname)

        # @see https://www.tensorflow.org/api_docs/python/tf/contrib/data/shuffle_and_repeat
        # synt.apply(shuffle_and_repeat(buffer_size = num_synt))  #, count = 1))
        # real.apply(shuffle_and_repeat(buffer_size = num_real))  #, count = ceil(ratio)))
        real = real.shuffle(num_real)  # no repetition! .repeat()

        # real data only
        augment = 0  # self.params.get('augment', 0)

        def name2real(name):
            inst = read_instr(os.path.join(inst_dir, name.decode() + '.png'))
            if augment:
                src_dir = self.params.get('augment_src', 'best')
                # print('{}/{}/{}'.format(real_dir, str(src_dir), name.decode() + '.JPG'))
                full = read_image(os.path.join(real_dir, str(src_dir), 'rgb', name.decode() + '.jpg'), False)
                pnts = read_points(os.path.join(real_dir, str(src_dir), 'points', name.decode() + '.txt'))
                if isinstance(src_dir, float):
                    pnts *= src_dir
                    self.params['augment_scale'] = 0.
                real = random_crop(full, pnts, self.params)
            else:
                real = read_image(os.path.join(real_dir, '160x160', 'gray', name.decode() + '.jpg'))
            return real, inst, name.decode()

        real = real.map(
            lambda name: tuple(tf.py_func(name2real, [name], [tf.float32, tf.int32, tf.string])),
            num_parallel_calls=num_threads)

        # dataset = Dataset.zip((rend, xfer, real, inst_synt, inst_real))
        dataset = Dataset.zip({'real': real})
        dataset = dataset.batch(self.batch_size, drop_remainder=True)  # we need full batches!
        dataset = dataset.prefetch(self.batch_size * 2)

        return dataset
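# --- Usage sketch (not part of the original file) ---------------------------------
# A minimal sketch of how the dataset returned by test_pipeline() might be consumed
# in a TF 1.x session. The `pipeline` object and NUM_THREADS are hypothetical
# stand-ins for whatever class owns test_pipeline(); only the iterator pattern is
# the point here.
import tensorflow as tf

NUM_THREADS = 4  # illustrative value

def consume_test_pipeline(pipeline):
    dataset = pipeline.test_pipeline(NUM_THREADS)
    iterator = dataset.make_one_shot_iterator()  # single pass over the test split
    next_batch = iterator.get_next()             # dict with key 'real': (image, instructions, name)
    with tf.Session() as sess:
        try:
            while True:
                batch = sess.run(next_batch)
                images, instrs, names = batch['real']
                # ... feed `images` / `instrs` to the evaluation graph here ...
        except tf.errors.OutOfRangeError:
            pass  # the one-shot iterator is exhausted after one epoch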
class BiLSTM(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.unit_num = config.unit_num
        self.vocab_size = config.vocab_size
        self.dense_unit_num = config.dense_unit_num
        self.train_batch_size = config.train_batch_size
        self.valid_batch_size = config.valid_batch_size
        self.test_batch_size = config.test_batch_size

        if config.train_mode == 'CHAR-RANDOM':
            # Text length (in characters)
            self.text_length = preprocess.MAX_CHAR_TEXT_LENGTH
            # Embedding dimension
            self.embedding_dim = config.embedding_dim
        elif config.train_mode == 'WORD-NON-STATIC':
            self.text_length = preprocess.MAX_WORD_TEXT_LENGTH
            self.embedding_dim = preprocess.vec_dim

        self.train_mode = config.train_mode
        self.input_x = None
        self.input_y = None
        self.labels = None
        self.dropout_keep_prob = None
        self.training = None
        self.embedding_inputs = None
        self.embedding_inputs_expanded = None
        self.loss = None
        self.accuracy = None
        self.prediction = None
        self.vocab = None
        self.vecs_dict = {}
        self.embedding_W = None

    def setBiLSTM(self):
        # Input layer
        self.input_x = tf.placeholder(tf.int32, [None, self.text_length], name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="input_y")
        # Convert the integer labels to one-hot form
        self.input_y = tf.one_hot(self.labels, self.class_num)
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # The `training` flag of batch_normalization should be True during training
        # and False during validation or testing.
        self.training = tf.placeholder(tf.bool, name='training')

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if self.train_mode == 'CHAR-RANDOM':
                # Randomly initialized embeddings
                W = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_dim], -1.0, 1.0))
            elif self.train_mode == 'WORD-NON-STATIC':
                # Pre-trained word vectors loaded earlier
                W = tf.Variable(self.embedding_W)
            self.embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)

        with tf.name_scope("batch_norm"):
            self.embedding_inputs = tf.layers.batch_normalization(self.embedding_inputs,
                                                                  training=self.training)

        def basic_lstm_cell():
            bcell = tf.nn.rnn_cell.LSTMCell(self.unit_num)
            return tf.nn.rnn_cell.DropoutWrapper(bcell, output_keep_prob=self.dropout_keep_prob)

        with tf.name_scope("RNN"):
            # Bidirectional LSTM network; each direction has unit_num cells.
            # ======================================================================================
            lstm_fw_cell = basic_lstm_cell()  # forward direction cell
            lstm_bw_cell = basic_lstm_cell()  # backward direction cell
            # [batch_size, sequence_length, hidden_size]
            # Creates a dynamic bidirectional recurrent neural network
            output, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell,
                                                        self.embedding_inputs, dtype=tf.float32)
            # Concatenate the forward and backward outputs
            output = tf.concat(output, axis=2)  # [batch_size, sequence_length, hidden_size*2]
            rnn_output = output[:, -1, :]       # take the last time step as the output
            # ======================================================================================

        with tf.name_scope("dense"):
            # Fully connected layer
            # ======================================================================================
            h_full = tf.layers.dense(inputs=rnn_output,
                                     units=self.dense_unit_num,
                                     use_bias=True,
                                     kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                                     bias_initializer=tf.constant_initializer(0.1))
            h_full = tf.layers.dropout(h_full, rate=self.dropout_keep_prob)
            h_full = tf.nn.relu(h_full)
            # ======================================================================================

        # Output layer
        with tf.name_scope('output'):
            score = tf.layers.dense(
                h_full,
                units=self.class_num,
                activation=None,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            self.score = tf.multiply(score, 1, name='score')
            self.prediction = tf.argmax(score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """Convert training-set lines to id (or word-vector) representation."""
        batch_x = []
        batch_y = []
        title = ""
        # 1. id
        for line in lines:
            line_ = line.decode("gbk").strip().split(',')
            title = ''.join(line_[0:-1])  # everything before the last comma is the title
            label = ''.join(line_[-1])    # the last field is the label
            batch_x.append(preprocess.to_id(title, self.vocab, self.train_mode))
            batch_y.append(label)
        batch_x = np.stack(batch_x)
        return batch_x, batch_y

    def convert_test_input(self, titles):
        """Convert test-set tsv lines to id (or word-vector) representation.

        :param titles:
        :return:
        """
        batch_x = []
        # 1. id
        for title in titles:
            valid_title = title.decode('gb18030').strip('\t')
            batch_x.append(preprocess.to_id(valid_title, self.vocab, self.train_mode))
        batch_x = np.stack(batch_x)
        return batch_x

    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. character level: read the vocabulary
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.CHAR_VOCAB_PATH))
        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into variables
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)

        # Split the dataset:
        # the first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(self.valid_batch_size)
        # the rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()
        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To get elements, first run sess.run(train_init_op) to initialize the iterator,
        # then run sess.run(next_train_element).
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()
        return train_init_op, valid_init_op, next_train_element, next_valid_element
        # =============================================================
        # Data preparation ends.

    def prepare_test_data(self):
        # Read the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. character level
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.CHAR_VOCAB_PATH))
        elif self.train_mode == 'WORD-NON-STATIC':
            self.vocab = preprocess.read_vocab(os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header line; remember to skip the first row when reading.
        dataset = TextLineDataset(os.path.join('data', preprocess.TEST_PATH))
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(self.test_batch_size)
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        return dataset, next_element
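# --- Training-loop sketch (not part of the original file) -------------------------
# A minimal sketch of how prepare_data()'s iterator ops and the BiLSTM placeholders
# are intended to fit together. The `config` object, the 1e-3 learning rate, the
# 10 epochs and the 0.5 dropout value are illustrative assumptions, not the
# author's settings.
import numpy as np
import tensorflow as tf

def train_bilstm(config):
    model = BiLSTM(config)
    # prepare_data() must run first so the vocabulary / pre-trained embedding
    # matrix exist before the graph is built.
    train_init_op, valid_init_op, next_train, next_valid = model.prepare_data()
    model.setBiLSTM()

    # batch_normalization keeps its moving-average updates in UPDATE_OPS; they
    # must be run together with the optimizer step.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(10):
            sess.run(train_init_op)  # rewind the training iterator for this epoch
            while True:
                try:
                    lines = sess.run(next_train)
                    batch_x, batch_y = model.convert_input(lines)
                    feed = {model.input_x: batch_x,
                            # convert_input returns labels as strings, so cast here
                            model.labels: np.asarray(batch_y, dtype=np.int32),
                            model.dropout_keep_prob: 0.5,
                            model.training: True}
                    _, loss, acc = sess.run([train_op, model.loss, model.accuracy], feed)
                except tf.errors.OutOfRangeError:
                    break  # one pass over the training split is done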
class CNN(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.img_size = config.img_size
        self.crop_size = config.crop_size
        self.train_batch_size = config.train_batch_size
        self.test_batch_size = config.test_batch_size
        self.test_per_batch = config.test_per_batch
        self.batch_x = ''
        self.batch_y = ''
        self.input_x = ''
        self.labels = ''
        self.input_y = ''
        self.dropout_keep_prob = ''
        self.training = ''
        self.embedding_inputs = ''
        self.embedding_inputs_expanded = ''
        self.loss = ''
        self.accuracy = ''
        self.prediction = ''
        self.vocab = ''

    def _set_input(self):
        # Input layer
        self.input_x = tf.placeholder(tf.float32, [None, self.img_size, self.img_size, 1], name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="labels")
        self.input_y = tf.one_hot(self.labels, self.class_num, name='input_y')
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # The `training` flag of batch_normalization should be True during training
        # and False during validation or testing.
        self.training = tf.placeholder(tf.bool, name='training')
        self.input_x_enhanced = tf.map_fn(self.image_enhance, self.input_x)

    def _conv(self, input, ksize, stride, filters):
        return tf.layers.conv2d(
            inputs=input,
            filters=filters,
            kernel_size=[ksize, ksize],
            strides=[stride, stride],
            padding='SAME',
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.initializers.constant(0.0),
        )

    def _maxpool_2x2(self, input):
        return tf.layers.max_pooling2d(
            inputs=input,
            pool_size=[2, 2],
            strides=[2, 2],
            padding='SAME',
        )

    def _fc(self, input, units, dropout_keep_prob, name=None):
        fc_output = tf.layers.dense(
            inputs=input,
            units=units,
            activation=tf.nn.relu,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            bias_initializer=tf.initializers.constant(0.0),
            kernel_regularizer=tf.contrib.layers.l2_regularizer(0.001),
            bias_regularizer=tf.contrib.layers.l2_regularizer(0.001),
            name=name
        )
        return tf.layers.dropout(fc_output, dropout_keep_prob)

    def setVGG19(self):
        """Define the model (VGG19-style) in this function.

        :return:
        """
        self._set_input()
        # Add batch normalization to reduce overfitting
        input_x_norm = tf.layers.batch_normalization(self.input_x_enhanced, training=self.training)

        # conv3-64
        conv3_64_1 = self._conv(input_x_norm, 3, 1, 64)
        conv3_64_output = self._conv(conv3_64_1, 3, 1, 64)
        # maxpool-1
        maxpool_1_output = self._maxpool_2x2(conv3_64_output)

        # conv3-128
        conv3_128_1 = self._conv(maxpool_1_output, 3, 1, 128)
        conv3_128_output = self._conv(conv3_128_1, 3, 1, 128)
        # maxpool-2
        maxpool_2_output = self._maxpool_2x2(conv3_128_output)

        # conv3-256
        conv3_256_1 = self._conv(maxpool_2_output, 3, 1, 256)
        conv3_256_2 = self._conv(conv3_256_1, 3, 1, 256)
        conv3_256_3 = self._conv(conv3_256_2, 3, 1, 256)
        conv3_256_output = self._conv(conv3_256_3, 3, 1, 256)
        # maxpool-3
        maxpool_3_output = self._maxpool_2x2(conv3_256_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_3_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_3 = self._conv(conv3_512_2, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_3, 3, 1, 512)
        # maxpool-4
        maxpool_4_output = self._maxpool_2x2(conv3_512_output)

        # conv4-512
        conv3_512_1 = self._conv(maxpool_4_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_3 = self._conv(conv3_512_2, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_3, 3, 1, 512)
        # maxpool-5
        maxpool_5_output = self._maxpool_2x2(conv3_512_output)

        # flatten
        shape = maxpool_5_output.shape.as_list()
        dims = shape[1] * shape[2] * shape[3]
        maxpool_5_output_flatten = tf.reshape(maxpool_5_output, [-1, dims])

        # fully-connected-1
        fc_1 = self._fc(maxpool_5_output_flatten, 2048, self.dropout_keep_prob)
        # fully-connected-2
        fc_2 = self._fc(fc_1, 2048, self.dropout_keep_prob)
        # fully-connected-3
        fc_3 = self._fc(fc_2, 1000, self.dropout_keep_prob)

        # Output layer
        self.score = self._fc(fc_3, self.class_num, self.dropout_keep_prob, name='score')
        self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + tf.losses.get_regularization_loss()

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """Convert the raw string lines into an array of shape
        [batch_size, img_size, img_size, 1] to be used as the CNN input.

        :param lines:
        :return:
        """
        batch_x = []
        batch_y = []
        for line in lines:
            line_ = line.decode('utf-8').strip().split(',')
            pixels = line_[1].split()  # pixel values
            label = line_[0]           # the first field is the label
            batch_x.append([float(x) for x in pixels])
            batch_y.append(int(label))
        batch_x = np.stack(batch_x)
        batch_x = batch_x.reshape([-1, self.img_size, self.img_size, 1])
        batch_y = np.asarray(batch_y)
        return batch_x, batch_y

    def prepare_data(self):
        self.train_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TRAIN_PATH)).skip(1)
        self.test_dataset = TextLineDataset(os.path.join('data', preprocess.FILTERED_TEST_PATH)).skip(1)
        print('Shuffling dataset...')
        # Shuffle the data
        train_dataset = self.train_dataset.shuffle(5000).batch(self.train_batch_size)
        test_dataset = self.test_dataset.shuffle(2500).batch(self.test_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        test_iterator = test_dataset.make_initializable_iterator()
        train_init_op = train_iterator.initializer
        test_init_op = test_iterator.initializer

        # To get elements, first run sess.run(train_init_op) to initialize the iterator,
        # then run sess.run(next_train_element).
        next_train_element = train_iterator.get_next()
        next_test_element = test_iterator.get_next()
        return train_init_op, test_init_op, next_train_element, next_test_element
        # ==============================================================

    def image_enhance(self, image):
        """Apply data augmentation to a single image.

        :param image: a numpy array or tensor of shape [height, width, 1]
        :return:
        """
        # Random crop
        images_crop = tf.image.random_crop(image, [self.crop_size, self.crop_size, 1])
        # Random horizontal flip
        images_crop = tf.image.random_flip_left_right(images_crop)
        # Random contrast
        images_crop = tf.image.random_contrast(images_crop, 0.5, 1.5)
        # Random brightness
        images_crop = tf.image.random_brightness(images_crop, max_delta=0.5)
        # Additive Gaussian noise
        noise = tf.random_normal(shape=tf.shape(images_crop), mean=0.0, stddev=10.0, dtype=tf.float32)
        images_crop = tf.add(images_crop, noise)
        return images_crop

    def setVGG16(self):
        """Define the model (VGG16-style) in this function.

        :return:
        """
        self._set_input()
        # Add batch normalization to reduce overfitting
        input_x_norm = tf.layers.batch_normalization(self.input_x_enhanced, training=self.training)

        # conv3-64
        conv3_64_1 = self._conv(input_x_norm, 3, 1, 64)
        conv3_64_output = self._conv(conv3_64_1, 3, 1, 64)
        # maxpool-1
        maxpool_1_output = self._maxpool_2x2(conv3_64_output)

        # conv3-128
        conv3_128_1 = self._conv(maxpool_1_output, 3, 1, 128)
        conv3_128_output = self._conv(conv3_128_1, 3, 1, 128)
        # maxpool-2
        maxpool_2_output = self._maxpool_2x2(conv3_128_output)

        # conv3-256
        conv3_256_1 = self._conv(maxpool_2_output, 3, 1, 256)
        conv3_256_2 = self._conv(conv3_256_1, 3, 1, 256)
        conv3_256_output = self._conv(conv3_256_2, 1, 1, 256)
        # maxpool-3
        maxpool_3_output = self._maxpool_2x2(conv3_256_output)

        # conv3-512
        conv3_512_1 = self._conv(maxpool_3_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_2, 1, 1, 512)
        # maxpool-4
        maxpool_4_output = self._maxpool_2x2(conv3_512_output)

        # conv4-512
        conv3_512_1 = self._conv(maxpool_4_output, 3, 1, 512)
        conv3_512_2 = self._conv(conv3_512_1, 3, 1, 512)
        conv3_512_output = self._conv(conv3_512_2, 1, 1, 512)
        # maxpool-5
        maxpool_5_output = self._maxpool_2x2(conv3_512_output)

        # flatten
        shape = maxpool_5_output.shape.as_list()
        dims = shape[1] * shape[2] * shape[3]
        maxpool_5_output_flatten = tf.reshape(maxpool_5_output, [-1, dims])

        # fully-connected-1
        fc_1 = self._fc(maxpool_5_output_flatten, 2048, self.dropout_keep_prob)
        # fully-connected-2
        fc_2 = self._fc(fc_1, 2048, self.dropout_keep_prob)
        # fully-connected-3
        fc_3 = self._fc(fc_2, 1000, self.dropout_keep_prob)

        # Output layer
        self.score = self._fc(fc_3, self.class_num, self.dropout_keep_prob, name='score')
        self.prediction = tf.argmax(self.score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + tf.losses.get_regularization_loss()

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
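# --- Evaluation sketch (not part of the original file) ----------------------------
# A minimal sketch of how the CNN class's test iterator and placeholders could be
# wired together for evaluation. The 'checkpoints' directory, the choice of
# setVGG16(), and the 1.0 dropout value are illustrative assumptions, not the
# author's script.
import tensorflow as tf

def evaluate_cnn(config):
    model = CNN(config)
    train_init_op, test_init_op, next_train, next_test = model.prepare_data()
    model.setVGG16()  # or model.setVGG19()

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))  # hypothetical path
        sess.run(test_init_op)  # initialize the test iterator before pulling batches
        accuracies = []
        while True:
            try:
                lines = sess.run(next_test)
                batch_x, batch_y = model.convert_input(lines)
                feed = {model.input_x: batch_x,
                        model.labels: batch_y,
                        model.dropout_keep_prob: 1.0,  # the graph still expects this placeholder
                        model.training: False}
                accuracies.append(sess.run(model.accuracy, feed))
            except tf.errors.OutOfRangeError:
                break  # test set exhausted
        print('test accuracy: %.4f' % (sum(accuracies) / len(accuracies)))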
class TextCNN(object):
    def __init__(self, config):
        self.class_num = config.class_num
        self.filter_sizes = config.filter_sizes
        self.filter_num = config.filter_num
        self.vocab_size = config.vocab_size
        self.dense_unit_num = config.dense_unit_num
        self.train_batch_size = config.train_batch_size
        self.valid_batch_size = config.valid_batch_size
        self.test_batch_size = config.test_batch_size

        if config.train_mode == 'CHAR-RANDOM':
            # Text length (in characters)
            self.text_length = preprocess.MAX_CHAR_TEXT_LENGTH
            # Embedding dimension
            self.embedding_dim = config.embedding_dim
        elif config.train_mode == 'WORD-NON-STATIC' or config.train_mode == 'MULTI':
            self.text_length = preprocess.MAX_WORD_TEXT_LENGTH
            self.embedding_dim = preprocess.vec_dim

        self.train_mode = config.train_mode
        self.input_x = None
        self.input_y = None
        self.labels = None
        self.dropout_keep_prob = None
        self.training = None
        self.embedding_inputs_expanded = None
        self.loss = None
        self.accuracy = None
        self.prediction = None
        self.vocab = None
        self.vecs_dict = {}
        self.embedding_W = None
        self.dataset = None

    def setCNN(self):
        # Input layer
        self.input_x = tf.placeholder(tf.int32, [None, self.text_length], name="input_x")
        self.labels = tf.placeholder(tf.int32, [None], name="input_y")
        # Convert the integer labels to one-hot form
        self.input_y = tf.one_hot(self.labels, self.class_num)
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # The `training` flag of batch_normalization should be True during training
        # and False during validation or testing.
        self.training = tf.placeholder(tf.bool, name='training')

        # Embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            if self.train_mode == 'CHAR-RANDOM':
                # Randomly initialized embeddings
                W = tf.Variable(
                    tf.random_uniform([self.vocab_size, self.embedding_dim], -1.0, 1.0))
                embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)
                self.embedding_inputs_expanded = tf.expand_dims(embedding_inputs, -1)
            elif self.train_mode == 'WORD-NON-STATIC':
                # Pre-trained word vectors loaded earlier
                W = tf.Variable(self.embedding_W)
                embedding_inputs = tf.nn.embedding_lookup(W, self.input_x)
                self.embedding_inputs_expanded = tf.expand_dims(embedding_inputs, -1)
            elif self.train_mode == 'MULTI':
                W1 = tf.Variable(self.embedding_W)
                W2 = tf.Variable(self.embedding_W, trainable=False)
                embedding_inputs1 = tf.nn.embedding_lookup(W1, self.input_x)
                embedding_inputs2 = tf.nn.embedding_lookup(W2, self.input_x)
                self.embedding_inputs_expanded = tf.stack(
                    [embedding_inputs1, embedding_inputs2], axis=-1)

        # The final pooling output, containing outputs from each filter
        pool_outputs = []
        # Iterate to create a convolution layer for each filter size
        for filter_size in self.filter_sizes:
            with tf.name_scope("conv-maxpool-%d" % filter_size):
                # Convolution layer 1
                # ==================================================================
                filter_shape = [filter_size, self.embedding_dim]
                conv_1 = tf.layers.conv2d(
                    inputs=self.embedding_inputs_expanded,
                    filters=self.filter_num,
                    kernel_size=filter_shape,
                    strides=[1, 1],
                    padding='VALID',
                    use_bias=True,
                    kernel_initializer=tf.initializers.truncated_normal(stddev=0.1),
                    bias_initializer=tf.initializers.constant(0.1))
                # ==================================================================

                # Do batch normalization
                # ==================================================================
                conv_1_output = tf.layers.batch_normalization(conv_1, training=self.training)
                conv_1_output = tf.nn.relu(conv_1_output)
                # ==================================================================

                # Pooling layer 1
                # ==================================================================
                conv_1_output_shape = conv_1_output.shape.as_list()
                pool_1 = tf.layers.max_pooling2d(
                    inputs=conv_1_output,
                    pool_size=[conv_1_output_shape[1] - 1 + 1, 1],
                    strides=[1, 1],
                    padding='VALID')
                # ==================================================================
                pool_outputs.append(pool_1)

        # Combine all the pooling outputs
        # The total number of filters.
        total_filter_num = self.filter_num * len(self.filter_sizes)
        h_pool = tf.concat(pool_outputs, 3)
        h_pool_flat = tf.reshape(h_pool, [-1, total_filter_num])
        # Output shape: [batch, total_filter_num]

        # Fully connected layer
        # ========================================================================
        with tf.name_scope('dense-%d' % self.dense_unit_num):
            h_full = tf.layers.dense(
                h_pool_flat,
                units=self.dense_unit_num,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            h_full = tf.layers.dropout(h_full, rate=self.dropout_keep_prob)
            h_full = tf.nn.relu(h_full)
        # ========================================================================

        # Output layer
        with tf.name_scope('output'):
            score = tf.layers.dense(
                h_full,
                units=self.class_num,
                activation=None,
                use_bias=True,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1),
                bias_initializer=tf.constant_initializer(0.1))
            self.score = tf.multiply(score, 1, name='score')
            self.prediction = tf.argmax(score, 1, name='prediction')

        # Loss function
        with tf.name_scope('loss'):
            losses = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=score, labels=self.input_y)
            self.loss = tf.reduce_mean(losses)

        # Calculate accuracy
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.prediction, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(
                tf.cast(correct_predictions, tf.float32))

    def convert_input(self, lines):
        """Convert training-set lines to id (or word-vector) representation."""
        batch_x = []
        batch_y = []
        title = ""
        for line in lines:
            line_ = line.decode("gbk").strip().split(',')
            title = ''.join(line_[0:-1])  # everything before the last comma is the title
            label = ''.join(line_[-1])    # the last field is the label
            batch_x.append(preprocess.to_id(title, self.vocab, self.train_mode))
            batch_y.append(label)
        batch_x = np.stack(batch_x)
        return batch_x, batch_y

    def convert_test_input(self, titles):
        """Convert test-set tsv lines to id (or word-vector) representation.

        :param titles:
        :return:
        """
        batch_x = []
        # 1. id
        for title in titles:
            valid_title = title.decode('gb18030').strip('\t')
            batch_x.append(
                preprocess.to_id(valid_title, self.vocab, self.train_mode))
        batch_x = np.stack(batch_x)
        return batch_x

    def prepare_data(self):
        # Data preparation.
        # =======================================================
        if self.train_mode == 'CHAR-RANDOM':
            # 1. character level: read the vocabulary
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))
        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            # Load the pre-trained word vectors into variables
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))
            self.vecs_dict = preprocess.load_vecs(
                os.path.join('data', preprocess.SGNS_WORD_PATH))
            self.embedding_W = np.ndarray(
                shape=[self.vocab_size, self.embedding_dim], dtype=np.float32)
            for word in self.vocab:
                # Row n holds the vector of the word whose id is n
                if word not in self.vecs_dict:
                    preprocess.add_word(word, self.vecs_dict)
                self.embedding_W[self.vocab[word]] = self.vecs_dict[word]

        self.dataset = TextLineDataset(
            os.path.join('data', preprocess.TRAIN_WITH_ID_PATH))
        print('Shuffling dataset...')
        self.dataset = self.dataset.shuffle(preprocess.TOTAL_TRAIN_SIZE)

        # Split the dataset:
        # the first VALID_SIZE samples go to the validation set
        valid_dataset = self.dataset.take(preprocess.VALID_SIZE).batch(
            self.valid_batch_size)
        # the rest goes to the training set
        train_dataset = self.dataset.skip(preprocess.VALID_SIZE).batch(
            self.train_batch_size)

        # Create a reinitializable iterator
        train_iterator = train_dataset.make_initializable_iterator()
        valid_iterator = valid_dataset.make_initializable_iterator()
        train_init_op = train_iterator.initializer
        valid_init_op = valid_iterator.initializer

        # To get elements, first run sess.run(train_init_op) to initialize the iterator,
        # then run sess.run(next_train_element).
        next_train_element = train_iterator.get_next()
        next_valid_element = valid_iterator.get_next()
        return train_init_op, valid_init_op, next_train_element, next_valid_element
        # =============================================================
        # Data preparation ends.

    def prepare_test_data(self):
        # Read the vocabulary
        if self.train_mode == 'CHAR-RANDOM':
            # 1. character level
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.CHAR_VOCAB_PATH))
        elif self.train_mode == 'WORD-NON-STATIC' or self.train_mode == 'MULTI':
            self.vocab = preprocess.read_vocab(
                os.path.join('data', preprocess.WORD_VOCAB_PATH))

        # The test set has a header line; remember to skip the first row when reading.
        dataset = TextLineDataset(os.path.join('data', preprocess.TEST_PATH))
        dataset = dataset.shuffle(preprocess.TOTAL_TEST_SIZE).batch(
            self.test_batch_size)
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        return dataset, next_element
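# --- Inference sketch (not part of the original file) -----------------------------
# A minimal sketch of how prepare_test_data() and convert_test_input() could be
# combined to obtain predictions from the TextCNN graph in CHAR-RANDOM mode. The
# 'checkpoints' directory and the 1.0 dropout value are illustrative assumptions;
# for WORD-NON-STATIC / MULTI modes the pre-trained embedding matrix would also
# have to be loaded first (as prepare_data() does).
import tensorflow as tf

def predict_textcnn(config):
    model = TextCNN(config)
    _, next_test_element = model.prepare_test_data()  # also loads the vocabulary
    model.setCNN()

    saver = tf.train.Saver()
    predictions = []
    with tf.Session() as sess:
        saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))  # hypothetical path
        while True:
            try:
                titles = sess.run(next_test_element)
                batch_x = model.convert_test_input(titles)
                feed = {model.input_x: batch_x,
                        model.dropout_keep_prob: 1.0,
                        model.training: False}
                predictions.extend(sess.run(model.prediction, feed))
            except tf.errors.OutOfRangeError:
                break  # the one-shot test iterator is exhausted
    return predictions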