def load(self):
    # Load vocabulary
    self.vocab_words = load_vocab(self.output_words)
    self.vocab_tags = load_vocab(self.output_tags)
    self.vocab_chars = load_vocab(self.output_chars)
    self.nwords = len(self.vocab_words)
    self.nchars = len(self.vocab_chars)
    self.ntags = len(self.vocab_tags)

    # Get processing words
    self.processing_word = get_processing_word(self.vocab_words, self.vocab_chars,
                                               lowercase=True, chars=self.use_chars)
    self.processing_tag = get_processing_word(self.vocab_tags,
                                              lowercase=False, allow_unk=False)

    # Get pre-trained embeddings
    self.embeddings = (get_trimmed_glove_vectors(self.output_trimmed)
                       if self.use_pretrained else None)
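# Hypothetical usage sketch (not part of the original source): load() reads the
# vocab files produced at build time, so it is typically called once on the
# config object after those files exist. `ModelConfig` is an assumed name for
# the class that owns this method; the attribute names match those used above.
config = ModelConfig()
config.load()
print(config.nwords, config.ntags, config.nchars)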
def __init__(self):
    self.vocabulary, self.vocabulary_reverse = load_vocab(args.data_path)

    tf.reset_default_graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model"):
            self.model = SEQ2SEQ(session, options, "predict")
    self.model.restore(os.path.join(args.root_path, args.restore_path))
def create_dataset(config):
    dataset_class = None
    dataset_params_train = {}
    dataset_params_val = {}
    dataset_params_test = {}

    def add_dataset_param(name, value_train, value_dev=None, value_test=None):
        dataset_params_train[name] = value_train
        dataset_params_val[name] = value_dev if value_dev is not None else value_train
        dataset_params_test[name] = value_test if value_test is not None else value_train

    titles_train, titles_val, titles_test = load_titles()
    add_dataset_param('titles', titles_train, titles_val, titles_test)

    if config.model == 'counts' or config.model == 'counts_deep':
        dataset_class = BiosCountsDataset
        bios_train, bios_val, bios_test = load_bios_counts(config.scrubbed)
        add_dataset_param('bios', bios_train, bios_val, bios_test)
        features_names = load_features_names(config.scrubbed)
        add_dataset_param('feature_names', features_names)
    elif config.model == 'han' or config.model == 'rnn':
        dataset_class = BiosSeqDataset
        bios_train, bios_val, bios_test = load_bios_seq(config.scrubbed)
        add_dataset_param('bios', bios_train, bios_val, bios_test)
        vocab = load_vocab(config.scrubbed)
        add_dataset_param('vocab', vocab)
    else:
        raise ValueError(f'Dataset for the model {config.model} is unknown')

    dataset_train = dataset_class(**dataset_params_train)
    dataset_val = dataset_class(**dataset_params_val)
    dataset_test = dataset_class(**dataset_params_test)

    print(f'Dataset: {type(dataset_train).__name__} - '
          f'{len(dataset_train)}, {len(dataset_val)}, {len(dataset_test)}')

    return dataset_train, dataset_val, dataset_test
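# Hypothetical usage sketch (not part of the original source): create_dataset
# only needs a config object exposing `model` and `scrubbed`; SimpleNamespace
# stands in for the project's real config, and the loader functions above are
# assumed to be importable from the same module. The values shown are assumptions.
from types import SimpleNamespace

config = SimpleNamespace(model='counts', scrubbed=False)
dataset_train, dataset_val, dataset_test = create_dataset(config)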
def read_test_data():
    with open(args.test_file, "r", encoding="utf-8") as f:
        examples, responses = [], []
        for line in f:
            example = line.split("\t")[-6:]
            example = [s.lower().split()[:args.max_utterance_len] for s in example]
            examples.append(example)
            responses.append(example[-1])
    return examples, responses


if __name__ == "__main__":
    vocabulary, vocabulary_reverse = load_vocab(args.data_path)
    examples, responses = read_test_data()

    tf.reset_default_graph()
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    session = tf.Session(config=tf_config)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model"):
            model = SEQ2SEQ(session, options, "predict")
    model.restore(os.path.join(args.root_path, args.restore_path))

    num_examples = len(examples)
    num_batches = num_examples // options.batch_size
    predict_responses = []
def __init__(self, is_training=True):
    self.graph = Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y = get_batch_data()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, pre.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, pre.maxlen))

        # Shift targets right for teacher forcing: prepend the start token (id 2)
        # and drop the last position.
        self.decoder_input = tf.concat((tf.ones_like(self.y[:, :1]) * 2,
                                        self.y[:, :-1]), axis=-1)

        en2id, id2en = load_vocab("./data/en_words.txt")
        ch2id, id2ch = load_vocab("./data/ch_words.txt")

        with tf.variable_scope("encoder"):
            self.enc = embedding(self.x, vocab_size=len(en2id),
                                 embedding_dim=pre.embedding)
            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1)), -1)
            self.enc = self.enc + position_embedding(self.x, embedding_dim=pre.embedding)
            self.enc = self.enc * key_masks
            self.enc = tf.layers.dropout(self.enc, rate=0.1,
                                         training=tf.convert_to_tensor(is_training))
            for i in range(6):
                self.enc = multihead_attention(queries=self.enc,
                                               keys=self.enc,
                                               embedding_dim=pre.embedding,
                                               num_head=8,
                                               dropout_rate=0.1,
                                               is_training=is_training,
                                               future_blind=False)
                self.enc = feedforward(inputs=self.enc)

        with tf.variable_scope("decode"):
            # The decoder consumes the shifted targets, not the raw targets.
            self.dec = embedding(inputs=self.decoder_input, vocab_size=len(ch2id),
                                 embedding_dim=pre.embedding)
            key_masks = tf.expand_dims(
                tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)
            self.dec = self.dec + position_embedding(self.decoder_input,
                                                     embedding_dim=pre.embedding)
            self.dec = self.dec * key_masks
            self.dec = tf.layers.dropout(self.dec, rate=0.1,
                                         training=tf.convert_to_tensor(is_training))
            for i in range(6):
                # Masked self-attention over the decoder inputs.
                self.dec = multihead_attention(queries=self.dec,
                                               keys=self.dec,
                                               embedding_dim=pre.embedding,
                                               num_head=8,
                                               dropout_rate=0.1,
                                               is_training=is_training,
                                               future_blind=True)
                # Encoder-decoder attention.
                self.dec = multihead_attention(queries=self.dec,
                                               keys=self.enc,
                                               embedding_dim=pre.embedding,
                                               num_head=8,
                                               dropout_rate=0.1,
                                               is_training=is_training,
                                               future_blind=False)
                self.dec = feedforward(self.dec)

        self.logits = tf.layers.dense(self.dec, len(ch2id))
        self.preds = tf.to_int32(tf.argmax(self.logits, axis=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = (tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget)
                    / tf.reduce_sum(self.istarget))

        if is_training:
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(ch2id)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(labels=self.y_smoothed,
                                                                logits=self.logits)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / tf.reduce_sum(self.istarget)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=0.0001,
                                                    beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.opt = self.optimizer.minimize(self.mean_loss)
dataset_dev = DatasetHandler(config.conll_dev, target_word)
dataset_test = DatasetHandler(config.conll_test, target_word)
dataset_train = DatasetHandler(config.conll_train, target_word)

# Build word and tag vocab
vocab_words, vocab_tags = get_vocabs([dataset_train, dataset_dev, dataset_test])
vocab_glove = get_glove_vocab(config.output_glove)

vocab = vocab_words & vocab_glove
vocab.add(UNK)
vocab.add(NUM)

# Process vocab
write_vocab(vocab, config.output_words)
write_vocab(vocab_tags, config.output_tags)

vocab = load_vocab(config.output_words)
export_trimmed_glove_vectors(vocab, config.output_glove,
                             config.output_trimmed, config.dim_word)

# Build and save char vocab
train = DatasetHandler(config.conll_train)
vocab_chars = get_char_vocab(train)
write_vocab(vocab_chars, config.output_chars)

# Build model
train_config = Config()
model = BilstmModel(train_config)
model.build()

# Create datasets
dev = DatasetHandler(train_config.conll_dev, train_config.processing_word,
num_hidden_layers = CONFIG["num_hidden_layers"]
embedding_size = CONFIG["embedding_size"]
char_embedding_size = CONFIG.get("char_embedding_size", 100)
nb_filters = CONFIG.get("nb_filters", 10)
hidden_layer_size = CONFIG["hidden_layer_size"]
RNN_LAYER_TYPE = CONFIG.get("RNN_LAYER_TYPE", "LSTM")
optimizer = CONFIG["optimizer"]
n_epochs = CONFIG["n_epochs"] + base_epochs
save_every = CONFIG["save_every"]
model_type = CONFIG.get("model_type", "rnn")  # rnn, brnn

RNN_CLASS = LSTM
if RNN_LAYER_TYPE == "GRU":
    RNN_CLASS = GRU

index_word, word_dict = pp.load_vocab(vocab_file)
char_dict = {}
if char_vocab_file is not None:
    index_char, char_dict = pp.load_vocab(char_vocab_file)
    char_vocab_size = len(index_char) + 2  # Add offset for OOV and padding
pp.WordToken.set_vocab(word_dict=word_dict, char_dict=char_dict)

index_labels, labels_dict = pp.load_vocab(labels_file)
index_boundary, boundary_dict = pp.load_vocab(boundary_file)
index_category, category_dict = pp.load_vocab(category_file)

vocab_size = len(index_word) + pp.WordToken.VOCAB + 1  # Add offset of VOCAB plus an extra token for padding
labels_size = len(index_labels) + 1  # Add extra token for padding
boundary_size = len(index_boundary) + 1  # Add extra token for padding
category_size = len(index_category) + 1  # Add extra token for padding

logger.info("Parameters: vocab_size = %s, label_type = %s, labels_size = %s, "
            "embedding_size = %s, maxlen = %s, boundary_size = %s, "
            "category_size = %s, hidden_layer_size = %s" %
            (vocab_size, label_type, labels_size, embedding_size, maxlen,
             boundary_size, category_size, hidden_layer_size))
# -*- coding: utf-8 -*-
# @Time    : 2020/8/21 2:34 PM
# @Author  : chezhonghao
# @description :
# @File    : predict_service.py
# @Software: PyCharm
import os

os.environ['CUDA_VISIBLE_DEVICES'] = "-1"

from base_transformer_model.config import Config
from preprocess import tfrecord_reader, encode_sentence, load_vocab
from base_transformer_model.component import create_look_ahead_mask, create_padding_mask
import tensorflow as tf

model = tf.saved_model.load('./saved_transformer_model', tags=[tf.saved_model.SERVING])
# inference = base_transformer_model.signatures['serving_default']
t2i, i2t = load_vocab('./transformer_model/vocab.txt')


def create_mask(dialogue, question, report):
    # Encoder padding mask.
    dialogue_padding_mask = create_padding_mask(dialogue)

    # Used in the decoder's second attention block;
    # this padding mask masks the encoder outputs.
    question_padding_mask = create_padding_mask(question)

    # Used in the decoder's first attention block
    # to pad and mask future tokens in the decoder's input.
    look_ahead_mask = create_look_ahead_mask(tf.shape(report)[1])
    dec_target_padding_mask = create_padding_mask(report)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return dialogue_padding_mask, combined_mask, question_padding_mask


def predict(inp_sentence, question, report):
    dialogue = encode_sentence(mode='dialogue', sentence=inp_sentence,
                               token2id=t2i, maxlen=Config.max_len['dialogue'])
    question = encode_sentence(mode='question', sentence=question,
                               token2id=t2i, maxlen=Config.max_len['question'])
    report = encode_sentence(mode='tar_report_input', sentence='',
                             token2id=t2i, maxlen=Config.max_len['report'])
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=100000):
        super(CustomSchedule, self).__init__()
        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


t2i, i2t = load_vocab(Config.vocab_path)
model.load_weights('../train_transformer_saved_weights/')


def create_mask(dialogue, question, report):
    # Encoder padding mask.
    dialogue_padding_mask = create_padding_mask(dialogue)

    # Used in the decoder's second attention block;
    # this padding mask masks the encoder outputs.
    question_padding_mask = create_padding_mask(question)

    # Used in the decoder's first attention block
    # to pad and mask future tokens in the decoder's input.
    look_ahead_mask = create_look_ahead_mask(tf.shape(report)[1])
    dec_target_padding_mask = create_padding_mask(report)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return dialogue_padding_mask, combined_mask, question_padding_mask
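# Hedged usage sketch (not in the original file): CustomSchedule above implements
# the warmup learning-rate schedule from "Attention Is All You Need", so it can be
# passed directly to a Keras optimizer as the learning rate. d_model=256 is an
# assumed value; the beta/epsilon settings follow the paper's recommendation.
learning_rate = CustomSchedule(d_model=256)
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9, beta_2=0.98, epsilon=1e-9)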
def embedding_layer(self):
    context_emb = None
    query_emb = None
    reply_emb = None

    if self.config.use_word_embeddings:
        self.embedding_dim += self.config.word_embeddings_dim
        context_word_input = Input(shape=(self.config.max_utterance_num * self.config.max_utterance_len,),
                                   dtype='int32', name='context_word_input')
        query_word_input = Input(shape=(self.config.max_utterance_len,),
                                 dtype='int32', name='query_word_input')
        reply_word_input = Input(shape=(self.config.max_utterance_len,),
                                 dtype='int32', name='reply_word_input')
        self.input_list.extend([context_word_input, query_word_input, reply_word_input])

        # Load pre-trained word embeddings
        emb_matrix = load_word_embeddings(self.config,
                                          self.config.word_embeddings_file,
                                          load_vocab(self.config.word_vocab_file))
        word_emb_layer = Embedding(input_dim=self.config.word_vocab_size,
                                   output_dim=self.config.word_embeddings_dim,
                                   weights=[emb_matrix],
                                   trainable=True)

        context_word_emb = word_emb_layer(context_word_input)
        context_word_emb = Reshape((self.config.max_utterance_num,
                                    self.config.max_utterance_len,
                                    self.config.word_embeddings_dim))(context_word_emb)
        query_word_emb = word_emb_layer(query_word_input)
        reply_word_emb = word_emb_layer(reply_word_input)

        context_emb = context_word_emb
        query_emb = query_word_emb
        reply_emb = reply_word_emb

    if self.config.use_char_embeddings:
        self.embedding_dim += self.config.char_features_dim
        context_char_input = Input(shape=(self.config.max_utterance_num,
                                          self.config.max_utterance_len,
                                          self.config.max_token_len),
                                   dtype='int32', name='context_char_input')
        query_char_input = Input(shape=(self.config.max_utterance_len,
                                        self.config.max_token_len),
                                 dtype='int32', name='query_char_input')
        reply_char_input = Input(shape=(self.config.max_utterance_len,
                                        self.config.max_token_len),
                                 dtype='int32', name='reply_char_input')
        self.input_list.extend([context_char_input, query_char_input, reply_char_input])

        char_emb_layer = Embedding(input_dim=self.config.char_vocab_size,
                                   output_dim=self.config.char_embeddings_dim)

        context_char_emb = Reshape((self.config.max_utterance_num *
                                    self.config.max_utterance_len *
                                    self.config.max_token_len,))(context_char_input)
        query_char_emb = Reshape((self.config.max_utterance_len *
                                  self.config.max_token_len,))(query_char_input)
        reply_char_emb = Reshape((self.config.max_utterance_len *
                                  self.config.max_token_len,))(reply_char_input)

        context_char_emb = char_emb_layer(context_char_emb)
        query_char_emb = char_emb_layer(query_char_emb)
        reply_char_emb = char_emb_layer(reply_char_emb)

        char_cnn_layer = Conv1D(filters=self.config.char_features_dim,
                                kernel_size=self.config.char_kernel_shape,
                                activation='tanh')

        context_char_emb = Reshape((self.config.max_utterance_num * self.config.max_utterance_len,
                                    self.config.max_token_len,
                                    self.config.char_embeddings_dim))(context_char_emb)
        context_char_emb = TimeDistributed(char_cnn_layer)(context_char_emb)
        context_char_emb = TimeDistributed(GlobalMaxPooling1D())(context_char_emb)
        context_char_emb = Reshape((self.config.max_utterance_num,
                                    self.config.max_utterance_len,
                                    self.config.char_features_dim))(context_char_emb)

        query_char_emb = Reshape((self.config.max_utterance_len,
                                  self.config.max_token_len,
                                  self.config.char_embeddings_dim))(query_char_emb)
        query_char_emb = TimeDistributed(char_cnn_layer)(query_char_emb)
        query_char_emb = TimeDistributed(GlobalMaxPooling1D())(query_char_emb)
        query_char_emb = Reshape((self.config.max_utterance_len,
                                  self.config.char_features_dim))(query_char_emb)

        reply_char_emb = Reshape((self.config.max_utterance_len,
                                  self.config.max_token_len,
                                  self.config.char_embeddings_dim))(reply_char_emb)
        reply_char_emb = TimeDistributed(char_cnn_layer)(reply_char_emb)
        reply_char_emb = TimeDistributed(GlobalMaxPooling1D())(reply_char_emb)
        reply_char_emb = Reshape((self.config.max_utterance_len,
                                  self.config.char_features_dim))(reply_char_emb)

        if context_emb is not None:
            context_emb = ly.concatenate(inputs=[context_emb, context_char_emb])
            query_emb = ly.concatenate(inputs=[query_emb, query_char_emb])
            reply_emb = ly.concatenate(inputs=[reply_emb, reply_char_emb])
        else:
            context_emb = context_char_emb
            query_emb = query_char_emb
            reply_emb = reply_char_emb

    self.embedding_dim += 1
    context_feature_input = Input(shape=(self.config.max_utterance_num * self.config.max_utterance_len,),
                                  dtype='float32', name='context_feature_input')
    query_feature_input = Input(shape=(self.config.max_utterance_len,),
                                dtype='float32', name='query_feature_input')
    reply_feature_input = Input(shape=(self.config.max_utterance_len,),
                                dtype='float32', name='reply_feature_input')
    self.input_list.extend([context_feature_input, query_feature_input, reply_feature_input])

    # assert context_emb is not None
    reply_emb = Dropout(self.config.dropout_rate)(reply_emb)

    context_feature = Reshape((self.config.max_utterance_num,
                               self.config.max_utterance_len, 1))(context_feature_input)
    context_emb = ly.concatenate(inputs=[context_emb, context_feature])
    query_feature = Reshape((self.config.max_utterance_len, 1))(query_feature_input)
    query_emb = ly.concatenate(inputs=[query_emb, query_feature])
    reply_feature = Reshape((self.config.max_utterance_len, 1))(reply_feature_input)
    reply_emb = ly.concatenate(inputs=[reply_emb, reply_feature])

    if self.config.use_char_embeddings:
        return context_emb, context_char_emb, query_emb, query_char_emb, reply_emb, reply_char_emb
    else:
        return context_emb, query_emb, reply_emb
import preprocess

preprocess.preprocess_data(read_file="data/sentences.train",
                           write_file="data/sentences.train.preprocess",
                           vocab_size=20000)
vocab, inv_vocab = preprocess.load_vocab()
data = preprocess.load_preprocessed_data()

preprocess.preprocess_eval_data(vocab,
                                read_file="data/sentences.test",
                                write_file="data/sentences.test.preprocess")
eval_data = preprocess.load_preprocessed_data(read_file="data/sentences.test.preprocess")

print("[Preprocessing done]")
print("vocab len=" + str(len(vocab)))
print("sentences num=" + str(len(data)))
print("sentence len=" + str(len(data[0])))
print("test num=" + str(len(eval_data)))
print("test len=" + str(len(eval_data[0])))