def generate_data(self): """Generate data for offline training.""" if self.infer_without_label: column_num = 1 text_ds = load_textline_dataset(self.paths_after_pre_process, column_num) else: column_num = 3 intent_label_ds, slots_label_ds, text_ds = load_textline_dataset( self.paths_after_pre_process, column_num) logging.info("Loading text dataset...") input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) if self.infer_without_label: data_set = text_ds else: intent_label_ds = process_one_label_dataset( intent_label_ds, self.config, output_index=0) slots_label_ds = process_multi_label_dataset( slots_label_ds, self.config, output_index=1) data_set = tf.data.Dataset.zip((text_ds, intent_label_ds, slots_label_ds)) self.config['data']['vocab_size'] = get_vocab_size( self.text_vocab_file_path) self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths_after_pre_process) return data_set
def prepare_raw_data(self, pre_process_pipeline):
  """Prepare raw data.

  For all kinds of text input, all_texts: [sentence1, ...]
  For a single output, all_labels: [[label1, label2, ...]]
  For multiple outputs, all_labels: [[label1_1, ...], [label1_2, ...]]
  """
  if self.output_num <= 1:
    all_labels = []
  else:
    all_labels = [[] for _ in range(self.output_num)]
  all_texts = []

  for mode in self.all_modes:
    paths = self.config["data"][mode]['paths']
    paths_after_pre_process = [one_path + ".after" for one_path in paths]
    logging.debug(
        "paths_after_pre_process: {}".format(paths_after_pre_process))
    infer_without_label = bool(mode == utils.INFER and self.infer_no_label)
    for one_path, one_path_after in zip(paths, paths_after_pre_process):
      data_size = get_file_len([one_path])
      self.prepare_one_raw_data([one_path], one_path_after, mode,
                                infer_without_label, pre_process_pipeline,
                                all_texts, all_labels, data_size)

  if self.output_num <= 1:
    # Wrap the flat label list so callers always see one list per output.
    all_labels = [all_labels]
  return all_texts, all_labels
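# Illustration (invented values) of the layouts prepare_raw_data returns:
all_texts_example = ["flight to boston", "play jazz"]
# Single output (output_num == 1): one label list, wrapped once at return.
all_labels_single = [["book_flight", "play_music"]]
# Two outputs (e.g. intent + slots): one list per output, aligned by sentence.
all_labels_multi = [["book_flight", "play_music"],
                    ["O O B-city", "O B-genre"]]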
def generate_data(self): """Generate data for offline training.""" paths = self.paths if self.infer_without_label: self.column_num = 1 text_ds = load_textline_dataset(paths, self.column_num) else: self.column_num = 2 label_ds, text_ds = load_textline_dataset(paths, self.column_num) logging.info("process text ds...") input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) logging.info("process label ds...") if self.infer_without_label: data_set = text_ds else: label_ds = process_multi_label_dataset(label_ds, self.config) data_set = tf.data.Dataset.zip((text_ds, label_ds)) self.config['data']['vocab_size'] = get_vocab_size( self.text_vocab_file_path) self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths) return data_set
def generate_data(self): """Generate data for offline training.""" if self.infer_without_label: column_num = 1 text_ds = load_textline_dataset(self.paths_after_pre_process, column_num) else: column_num = 2 label_ds, text_ds = load_textline_dataset( self.paths_after_pre_process, column_num) input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds = text_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds = text_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) text_ds = tf.data.Dataset.zip((text_ds, text_size_ds)) if self.use_dense: dense = load_npy(self.dense_npy) dense_ds = load_dense_dataset(dense) if self.infer_without_label: if self.use_dense: data_set = tf.data.Dataset.zip((text_ds, dense_ds)) else: data_set = text_ds else: label_ds = process_one_label_dataset(label_ds, self.config) if self.use_dense: data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds)) else: data_set = tf.data.Dataset.zip((text_ds, label_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) if self.split_token != "": if self.split_token not in vocab_dict: raise ValueError( "The Model uses split token: {}, not in corpus.".format( self.split_token)) self.config['data']['split_token'] = int( vocab_dict[self.split_token]) self.config['data']['vocab_size'] = vocab_size self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths_after_pre_process) return data_set
def generate_data(self): """Generate data for offline training.""" if self.infer_without_label: column_num = 2 text_ds_left, text_ds_right = load_textline_dataset( self.paths_after_pre_process, column_num) else: column_num = 3 label, text_ds_left, text_ds_right = load_textline_dataset( self.paths_after_pre_process, column_num) input_pipeline_func = self.get_input_pipeline(for_export=False) text_ds_left = text_ds_left.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_ds_right = text_ds_right.map( input_pipeline_func, num_parallel_calls=self.num_parallel_calls) text_size_ds_left = text_ds_left.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_size_ds_right = text_ds_right.map( lambda x: compute_sen_lens(x, padding_token=0), num_parallel_calls=self.num_parallel_calls) text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right)) text_len_left_right = tf.data.Dataset.zip( (text_size_ds_left, text_size_ds_right)) if self.infer_without_label: data_set_left_right = text_ds_left_right else: label_ds = process_one_label_dataset(label, self.config) data_set_left_right = tf.data.Dataset.zip( (text_ds_left_right, label_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) self.config['data']['vocab_size'] = vocab_size self.config['data']['{}_data_size'.format(self.mode)] = get_file_len( self.paths_after_pre_process) return data_set_left_right, text_len_left_right
def generate_data(self): """Generate data for offline training.""" column_num = 1 src_path = self.src_paths_after_pre_process target_path = self.tgt_paths_after_pre_process src_ds = load_textline_dataset([src_path], column_num) src_ds = src_ds[0] input_pipeline_func = self.get_input_pipeline(for_export=False) src_ds = src_ds.map(input_pipeline_func, num_parallel_calls=self.num_parallel_calls) src_size_ds = src_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) src_ds = src_ds.map(self.exclude_padding, num_parallel_calls=self.num_parallel_calls) if self.infer_without_label: data_set = tf.data.Dataset.zip((src_ds, src_size_ds)) else: tgt = load_textline_dataset([target_path], column_num) tgt = tgt[0] tgt_out_ds = tgt.map(lambda x: x + ' ' + self.END_TOKEN) tgt_in_ds = tgt.map(lambda x: self.START_TOKEN + ' ' + x) tgt_in_ds = tgt_in_ds.map( lambda batch: self.text_pipeline_func( batch, self.max_dec_len, self.text_vocab_file_path), num_parallel_calls=self.num_parallel_calls) tgt_in_size_ds = tgt_in_ds.map( lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX), num_parallel_calls=self.num_parallel_calls) tgt_in_ds = tgt_in_ds.map( self.exclude_padding, num_parallel_calls=self.num_parallel_calls) inp_ds = tf.data.Dataset.zip( (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds)) if self.use_label_vocab: target_vocab_file_path = self.label_vocab_file_paths[0] else: target_vocab_file_path = self.text_vocab_file_path tgt_out_ds = tgt_out_ds.map( lambda batch: self.text_pipeline_func(batch, self.max_dec_len, target_vocab_file_path), num_parallel_calls=self.num_parallel_calls) tgt_out_ds = tgt_out_ds.map( self.exclude_padding, num_parallel_calls=self.num_parallel_calls) data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds)) vocab_dict = load_vocab_dict(self.text_vocab_file_path) vocab_size = len(vocab_dict) label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0]) label_vocab_size = len(label_vocab_dict) data_size = get_file_len(self.src_paths_after_pre_process) self.config['data']['vocab_size'] = vocab_size self.config['data']['label_vocab_size'] = label_vocab_size self.config['data']['{}_data_size'.format(self.mode)] = data_size return data_set
def test_get_file_len(self):
  paths = self.config["data"]["train"]["paths"]
  self.assertEqual(get_file_len(paths), 300)