def export_inputs(self): """Inputs for exported model.""" vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) self.config['data']['vocab_size'] = vocab_size input_sent_left = tf.placeholder( shape=(None,), dtype=tf.string, name="input_sent_left") input_sent_right = tf.placeholder( shape=(None,), dtype=tf.string, name="input_sent_right") input_pipeline_func = self.get_input_pipeline(for_export=True) token_ids_left = input_pipeline_func(input_sent_left) token_ids_right = input_pipeline_func(input_sent_right) token_ids_len_left = tf.map_fn( lambda x: compute_sen_lens(x, padding_token=0), token_ids_left) token_ids_len_right = tf.map_fn( lambda x: compute_sen_lens(x, padding_token=0), token_ids_right) export_data = { "export_inputs": { "input_sent_left": input_sent_left, "input_sent_right": input_sent_right, }, "model_inputs": { "input_x_left": token_ids_left, "input_x_right": token_ids_right, "input_x_left_len": token_ids_len_left, "input_x_right_len": token_ids_len_right, "input_x_len": [token_ids_len_left, token_ids_len_right] } } return export_data
def generate_data(self): """Generate data for offline training.""" (text_left, text_right), label = load_match_raw_data( paths=self.paths_after_pre_process, mode=self.mode) text_left_placeholder = tf.placeholder(tf.string, name="text_left") text_right_placeholder = tf.placeholder(tf.string, name="text_right") label_placeholder = tf.placeholder(tf.string, name="label") self.init_feed_dict[text_left_placeholder] = text_left self.init_feed_dict[text_right_placeholder] = text_right self.init_feed_dict[label_placeholder] = label text_ds_left = tf.data.Dataset.from_tensor_slices( text_left_placeholder) text_ds_right = tf.data.Dataset.from_tensor_slices( text_right_placeholder) input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds_left = text_ds_left.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_ds_right = text_ds_right.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds_left = text_ds_left.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_size_ds_right = text_ds_right.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right)) text_len_left_right = tf.data.Dataset.zip( (text_size_ds_left, text_size_ds_right)) if self.infer_without_label: data_set_left_right = text_ds_left_right else: label_ds = load_one_label_dataset(label_placeholder, self.config) data_set_left_right = tf.data.Dataset.zip( (text_ds_left_right, label_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) data_size = len(text_left) if self.split_token != "": if self.split_token not in vocab_dict: raise ValueError( "The Model uses split token: {}, not in corpus.".format( self.split_token)) self.config['data']['split_token'] = int( vocab_dict[self.split_token]) self.config['data']['vocab_size'] = vocab_size self.config['data']['{}_data_size'.format(self.mode)] = data_size return data_set_left_right, text_len_left_right
def export_inputs(self): """Inputs for exported model.""" vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0]) label_vocab_size = len(label_vocab_dict) self.config['data']['vocab_size'] = vocab_size self.config['data']['label_vocab_size'] = label_vocab_size input_sentence = tf.placeholder(shape=(None, ), dtype=tf.string, name="input_sentence") input_pipeline_func = self.get_input_pipeline(for_export=True) token_ids = input_pipeline_func(input_sentence) token_ids_len = tf.map_fn( lambda x: compute_sen_lens(x, padding_token=0), token_ids) export_data = { "export_inputs": { "input_sentence": input_sentence }, "model_inputs": { "input_enc_x": token_ids, "input_x_len": token_ids_len } } return export_data
def generate_data(self): """Generate data for offline training.""" if self.infer_without_label: column_num = 1 text_ds = load_textline_dataset(self.paths_after_pre_process, column_num) else: column_num = 3 intent_label_ds, slots_label_ds, text_ds = load_textline_dataset( self.paths_after_pre_process, column_num) logging.info("Loading text dataset...") input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) if self.infer_without_label: data_set = text_ds else: intent_label_ds = process_one_label_dataset( intent_label_ds, self.config, output_index=0) slots_label_ds = process_multi_label_dataset( slots_label_ds, self.config, output_index=1) data_set = tf.data.Dataset.zip((text_ds, intent_label_ds, slots_label_ds)) self.config['data']['vocab_size'] = get_vocab_size( self.text_vocab_file_path) self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths_after_pre_process) return data_set
def generate_data(self): """Generate data for offline training.""" paths = self.paths if self.infer_without_label: self.column_num = 1 text_ds = load_textline_dataset(paths, self.column_num) else: self.column_num = 2 label_ds, text_ds = load_textline_dataset(paths, self.column_num) logging.info("process text ds...") input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) logging.info("process label ds...") if self.infer_without_label: data_set = text_ds else: label_ds = process_multi_label_dataset(label_ds, self.config) data_set = tf.data.Dataset.zip((text_ds, label_ds)) self.config['data']['vocab_size'] = get_vocab_size( self.text_vocab_file_path) self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths) return data_set
def export_inputs(self): """Inputs for exported model.""" vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) if self.split_token != "": if self.split_token not in vocab_dict: raise ValueError( "The Model uses split token: {}, not in corpus.".format( self.split_token)) self.config['data']['split_token'] = int(vocab_dict[self.split_token]) self.config['data']['vocab_size'] = vocab_size input_sentence = tf.placeholder( shape=(None,), dtype=tf.string, name="input_sentence") input_pipeline_func = self.get_input_pipeline(for_export=True) token_ids = input_pipeline_func(input_sentence) token_ids_len = tf.map_fn(lambda x: compute_sen_lens(x, padding_token=0), token_ids) export_data = { "export_inputs": { "input_sentence": input_sentence }, "model_inputs": { "input_enc_x": token_ids, "input_x_len": token_ids_len } } return export_data
def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
  input_x = inputs["input_x"]
  if self.use_dense_task:
    dense_input = inputs["input_dense"]

  if self.use_true_length:
    # [batch_size, max_doc_len, max_sen_len]
    input_hx = self.pad_to_hier_input_true_len(
        input_x,
        self.max_doc_len,
        self.max_sen_len,
        self.split_token,
        padding_token=self.padding_token)
  else:
    # [batch_size, max_doc_len, max_sen_len]
    input_hx = self.pad_to_hier_input(
        input_x,
        self.max_doc_len,
        self.max_sen_len,
        padding_token=self.padding_token)

  # [batch_size, max_doc_len]
  sen_lens = compute_sen_lens(input_hx, padding_token=self.padding_token)
  # [batch_size]
  doc_lens = compute_doc_lens(sen_lens)
  # [batch_size, max_doc_len, max_sen_len, 1]
  sen_mask = tf.expand_dims(
      tf.sequence_mask(sen_lens, self.max_sen_len, dtype=tf.float32), axis=-1)
  # [batch_size, max_doc_len, 1]
  doc_mask = tf.expand_dims(
      tf.sequence_mask(doc_lens, self.max_doc_len, dtype=tf.float32), axis=-1)

  # [batch_size, max_doc_len, max_sen_len, embed_len]
  out = self.embed(input_hx)
  if self.use_pretrained_model:
    input_px = self.get_pre_train_graph(input_x)
    input_px = tf.reshape(
        input_px,
        [-1, self.max_doc_len, self.max_sen_len, self.pretrained_model_dim])
    out = tf.concat([out, input_px], axis=-1)
  out = self.embed_d(out, training=training)
  all_sen_encoder = tf.keras.layers.TimeDistributed(self.sen_encoder)
  # [batch_size, max_doc_len, features]
  out = all_sen_encoder(out, training=training, mask=sen_mask)
  # [batch_size, features]
  out = self.doc_encoder(out, training=training, mask=doc_mask)

  if self.use_dense_input:
    dense_out = self.dense_input_linear(dense_input)
    if self.only_dense_input:
      out = dense_out
    else:
      out = tf.keras.layers.Concatenate()([out, dense_out])

  # [batch_size, class_num]
  scores = self.final_dense(out)
  return scores

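# The hierarchical call above needs per-document sentence counts via
# compute_doc_lens. A minimal sketch of such a helper is given below for
# illustration; the name and logic are assumptions consistent with how
# doc_lens is used with tf.sequence_mask, not the library's actual code.
def compute_doc_lens_sketch(sen_lens):
  """Count sentences with a non-zero length along the last axis."""
  return tf.reduce_sum(tf.cast(tf.not_equal(sen_lens, 0), tf.int32), axis=-1)
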
def generate_data(self): """Generate data for offline training.""" text, label = load_cls_raw_data(paths=self.paths_after_pre_process, mode=self.mode) text_placeholder = tf.placeholder(tf.string, shape=(None, ), name="text") label_placeholder = tf.placeholder(tf.string, name="label") self.init_feed_dict[text_placeholder] = text self.init_feed_dict[label_placeholder] = label # logging.debug("init_feed_dict: {}".format(self.init_feed_dict)) text_ds = tf.data.Dataset.from_tensor_slices(text_placeholder) input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) if self.use_dense: dense = load_npy(self.dense_npy) dense_ds = load_dense_dataset(dense) if self.infer_without_label: if self.use_dense: data_set = tf.data.Dataset.zip((text_ds, dense_ds)) else: data_set = text_ds else: label_ds = load_one_label_dataset(label_placeholder, self.config) if self.use_dense: data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds)) else: data_set = tf.data.Dataset.zip((text_ds, label_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) data_size = len(text) if self.split_token != "": if self.split_token not in vocab_dict: raise ValueError( "The Model uses split token: {}, not in corpus.".format( self.split_token)) self.config['data']['split_token'] = int( vocab_dict[self.split_token]) self.config['data']['vocab_size'] = vocab_size self.config['data']['{}_data_size'.format(self.mode)] = data_size return data_set
def load_text_dataset(self, text_ds): """Load text data set.""" logging.info("Loading text dataset...") input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) return text_ds
def generate_data(self): """Generate data for offline training.""" if self.infer_without_label: column_num = 1 text_ds = load_textline_dataset(self.paths_after_pre_process, column_num) else: column_num = 2 label_ds, text_ds = load_textline_dataset( self.paths_after_pre_process, column_num) input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) if self.use_dense: dense = load_npy(self.dense_npy) dense_ds = load_dense_dataset(dense) if self.infer_without_label: if self.use_dense: data_set = tf.data.Dataset.zip((text_ds, dense_ds)) else: data_set = text_ds else: label_ds = process_one_label_dataset(label_ds, self.config) if self.use_dense: data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds)) else: data_set = tf.data.Dataset.zip((text_ds, label_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) if self.split_token != "": if self.split_token not in vocab_dict: raise ValueError( "The Model uses split token: {}, not in corpus.".format( self.split_token)) self.config['data']['split_token'] = int( vocab_dict[self.split_token]) self.config['data']['vocab_size'] = vocab_size self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths_after_pre_process) return data_set
def generate_data(self): """Generate data for offline training.""" if self.infer_without_label: column_num = 2 text_ds_left, text_ds_right = load_textline_dataset( self.paths_after_pre_process, column_num) else: column_num = 3 label, text_ds_left, text_ds_right = load_textline_dataset( self.paths_after_pre_process, column_num) input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds_left = text_ds_left.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_ds_right = text_ds_right.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds_left = text_ds_left.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_size_ds_right = text_ds_right.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right)) text_len_left_right = tf.data.Dataset.zip( (text_size_ds_left, text_size_ds_right)) if self.infer_without_label: data_set_left_right = text_ds_left_right else: label_ds = process_one_label_dataset(label, self.config) data_set_left_right = tf.data.Dataset.zip( (text_ds_left_right, label_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) self.config['data']['vocab_size'] = vocab_size self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths_after_pre_process) return data_set_left_right, text_len_left_right
def call(self, inputs, training=None, mask=None):
  enc_inputs = inputs["input_enc_x"]
  seq_enc_len = compute_sen_lens(enc_inputs, padding_token=self.padding_token)
  enc_mask = self.mask_layer(enc_inputs)
  enc_inputs = self.embed(enc_inputs)
  enc_inputs = self.embed_d(enc_inputs)
  enc_outputs, enc_state = self.encoder(
      enc_inputs, training=training, mask=enc_mask)

  if self.is_infer:
    dec_outputs = self.decoder([enc_outputs, enc_state, seq_enc_len],
                               training=training)
    return dec_outputs
  else:
    dec_inputs = inputs["input_dec_x"]
    seq_dec_len = compute_sen_lens(dec_inputs,
                                   padding_token=self.padding_token)
    dec_outputs = self.decoder(
        [dec_inputs, seq_dec_len, enc_outputs, enc_state, seq_enc_len],
        training=training)
    return dec_outputs

def test_compute_sen_lens(self):
  sentences = tf.placeholder(dtype=tf.int32)
  lens = compute_sen_lens(sentences)

  with self.cached_session(use_gpu=False, force_gpu=False) as sess:
    # test for 1d
    res = sess.run(lens, feed_dict={sentences: [1, 2, 0, 0]})
    self.assertEqual(res, 2)

    # test for 2d
    res = sess.run(lens, feed_dict={sentences: [[1, 2, 0, 0], [1, 2, 3, 4]]})
    self.assertAllEqual(res, [2, 4])

    # test for 3d
    res = sess.run(
        lens,
        feed_dict={
            sentences: [[[1, 2, 0, 0]], [[1, 2, 3, 4]], [[1, 0, 0, 0]]]
        })
    self.assertAllEqual(res, [[2], [4], [1]])

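# A minimal sketch of a compute_sen_lens-style helper that satisfies the test
# above: count the non-padding tokens along the last axis. The name and body
# are assumptions for illustration, not necessarily the library's actual
# implementation.
def compute_sen_lens_sketch(sentences, padding_token=0):
  """Return the number of non-padding tokens along the last axis."""
  non_padding = tf.not_equal(sentences, padding_token)
  return tf.reduce_sum(tf.cast(non_padding, tf.int32), axis=-1)
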
def call(self, inputs, training=None, mask=None):
  # [batch_size, max_len]
  input_x = inputs["input_x"]
  input_x_lens = compute_sen_lens(input_x, padding_token=self.padding_token)
  # [batch_size, max_len, 1]
  mask = tf.expand_dims(
      tf.sequence_mask(input_x_lens, self.max_len, dtype=tf.float32), axis=-1)
  # [batch_size, max_len, embed_len]
  out = self.embed(input_x)
  out = self.embed_dropout(out, training=training)
  # [batch_size, features]
  out = self.bi_rnn(out)

  intent_out = self.attention(out, mask=mask)
  intent_out = self.dropout(intent_out)
  intent_out = self.intent_dense(intent_out)
  intent_out = tf.identity(intent_out, name="intent_logits")

  slots_out = self.dropout(out)
  slots_out = self.slots_dense(slots_out)
  slots_out = tf.identity(slots_out, name="slots_logits")

  return intent_out, slots_out

def generate_data(self): """Generate data for offline training.""" column_num = 1 src_path = self.src_paths_after_pre_process target_path = self.tgt_paths_after_pre_process src_ds = load_textline_dataset([src_path], column_num) src_ds = src_ds[0] input_pipeline_func = self.get_input_pipeline(for_export=False) src_ds = src_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) src_size_ds = src_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) src_ds = src_ds.map(self.exclude_padding, num_parallel_calls=self.num_parallel_calls) if self.infer_without_label: data_set = tf.data.Dataset.zip((src_ds, src_size_ds)) else: tgt = load_textline_dataset([target_path], column_num) tgt = tgt[0] tgt_out_ds = tgt.map(lambda x: x + ' ' + self.END_TOKEN) tgt_in_ds = tgt.map(lambda x: self.START_TOKEN + ' ' + x) tgt_in_ds = tgt_in_ds.map( lambda batch: self.text_pipeline_func( batch, self.max_dec_len, self.text_vocab_file_path), num_parallel_calls=self.num_parallel_calls) tgt_in_size_ds = tgt_in_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) tgt_in_ds = tgt_in_ds.map( self.exclude_padding, num_parallel_calls=self.num_parallel_calls) inp_ds = tf.data.Dataset.zip( (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds)) if self.use_label_vocab: target_vocab_file_path = self.label_vocab_file_paths[0] else: target_vocab_file_path = self.text_vocab_file_path tgt_out_ds = tgt_out_ds.map( lambda batch: self.text_pipeline_func(batch, self.max_dec_len, target_vocab_file_path), num_parallel_calls=self.num_parallel_calls) tgt_out_ds = tgt_out_ds.map( self.exclude_padding, num_parallel_calls=self.num_parallel_calls) data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0]) label_vocab_size = len(label_vocab_dict) data_size = get_file_len(self.src_paths_after_pre_process) self.config['data']['vocab_size'] = vocab_size self.config['data']['label_vocab_size'] = label_vocab_size self.config['data']['{}_data_size'.format(self.mode)] = data_size return data_set
def generate_data(self): """Generate data for offline training.""" src = load_seq2seq_raw_data(paths=self.src_paths_after_pre_process) tgt = load_seq2seq_raw_data(paths=self.tgt_paths_after_pre_process) tgt_out = [abs_ + ' ' + self.END_TOKEN for abs_ in tgt] tgt_in = [self.START_TOKEN + ' ' + abs_ for abs_ in tgt] assert len(src) == len(tgt_in) src_placeholder = tf.placeholder(tf.string, shape=(None,), name="src") tgt_out_placeholder = tf.placeholder(tf.string, name="tgt_out") tgt_in_placeholder = tf.placeholder(tf.string, name="tgt_in") self.init_feed_dict[src_placeholder] = src self.init_feed_dict[tgt_out_placeholder] = tgt_out self.init_feed_dict[tgt_in_placeholder] = tgt_in src_ds = tf.data.Dataset.from_tensor_slices(src_placeholder) tgt_in_ds = tf.data.Dataset.from_tensor_slices(tgt_in_placeholder) tgt_out_ds = tf.data.Dataset.from_tensor_slices(tgt_out_placeholder) input_pipeline_func = self.get_input_pipeline(for_export=False) src_ds = src_ds.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) src_size_ds = src_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) src_ds = src_ds.map(self.exclude_padding, num_parallel_calls=self.num_parallel_calls) tgt_in_ds = tgt_in_ds.map( lambda batch: self.text_pipeline_func(batch, self.max_dec_len, self.text_vocab_file_path), num_parallel_calls=self.num_parallel_calls) tgt_in_size_ds = tgt_in_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) tgt_in_ds = tgt_in_ds.map(self.exclude_padding, num_parallel_calls=self.num_parallel_calls) inp_ds = tf.data.Dataset.zip((src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds)) if self.infer_without_label: data_set = inp_ds else: if self.use_label_vocab: target_vocab_file_path = self.label_vocab_file_paths[0] else: target_vocab_file_path = self.text_vocab_file_path tgt_out_ds = tgt_out_ds.map( lambda batch: self.text_pipeline_func(batch, self.max_dec_len, target_vocab_file_path), num_parallel_calls=self.num_parallel_calls) tgt_out_ds = tgt_out_ds.map( self.exclude_padding, num_parallel_calls=self.num_parallel_calls ) data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0]) label_vocab_size = len(label_vocab_dict) data_size = len(src) self.config['data']['vocab_size'] = vocab_size self.config['data']['label_vocab_size'] = label_vocab_size self.config['data']['{}_data_size'.format(self.mode)] = data_size return data_set