Example No. 1
  def generate_data(self):
    """Generate data for offline training."""
    if self.infer_without_label:
      # Inference input has a single column: the text itself.
      column_num = 1
      text_ds = load_textline_dataset(self.paths_after_pre_process, column_num)
    else:
      # Training input has three columns: intent label, slot labels and text.
      column_num = 3
      intent_label_ds, slots_label_ds, text_ds = load_textline_dataset(
          self.paths_after_pre_process, column_num)

    logging.info("Loading text dataset...")
    input_pipeline_func = self.get_input_pipeline(for_export=False)
    text_ds = text_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
    text_size_ds = text_ds.map(
        lambda x: compute_sen_lens(x, padding_token=0),
        num_parallel_calls=self.num_parallel_calls)
    text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

    if self.infer_without_label:
      data_set = text_ds
    else:
      intent_label_ds = process_one_label_dataset(
          intent_label_ds, self.config, output_index=0)
      slots_label_ds = process_multi_label_dataset(
          slots_label_ds, self.config, output_index=1)
      data_set = tf.data.Dataset.zip((text_ds, intent_label_ds, slots_label_ds))

    self.config['data']['vocab_size'] = get_vocab_size(
        self.text_vocab_file_path)
    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
        self.paths_after_pre_process)

    return data_set
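All of these generate_data variants share the same tf.data pattern: map the text pipeline over the raw lines, compute per-sentence lengths, and zip the two datasets together. Below is a minimal, self-contained sketch of that pattern; the compute_sen_lens here is a hypothetical stand-in (assumed to count non-padding tokens) and the toy token ids are made up, not taken from the repository.

import tensorflow as tf

def compute_sen_lens(tokens, padding_token=0):
    # Assumed behaviour: count the non-padding positions in each sentence.
    return tf.reduce_sum(
        tf.cast(tf.not_equal(tokens, padding_token), tf.int32), axis=-1)

# Toy, already tokenized and padded sentences.
text_ds = tf.data.Dataset.from_tensor_slices(
    [[3, 5, 7, 0, 0], [2, 4, 0, 0, 0]])
text_size_ds = text_ds.map(lambda x: compute_sen_lens(x, padding_token=0))
# Each element is now a (token_ids, length) pair, e.g. ([3, 5, 7, 0, 0], 3).
text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))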
Example No. 2
    def prepare_raw_data(self, pre_process_pipeline):
        """
    Preparing raw data.
    For all kinds of text input, all_texts: [sentence1, ...]
    For single output, all_labels: [[label1, label2, ...]]
    For multiple outputs, all_labels: [[label1_1, ...], [label1_2, ...]]
    """
        if self.output_num <= 1:
            all_labels = []
        else:
            all_labels = [[] for _ in range(self.output_num)]
        all_texts = []
        for mode in self.all_modes:
            paths = self.config["data"][mode]['paths']
            paths_after_pre_process = [
                one_path + ".after" for one_path in paths
            ]
            logging.debug(
                "paths_after_pre_process: {}".format(paths_after_pre_process))

            infer_without_label = bool(mode == utils.INFER
                                       and self.infer_no_label)

            for one_path, one_path_after in zip(paths,
                                                paths_after_pre_process):
                data_size = get_file_len([one_path])
                self.prepare_one_raw_data([one_path], one_path_after, mode,
                                          infer_without_label,
                                          pre_process_pipeline, all_texts,
                                          all_labels, data_size)
        if self.output_num <= 1:
            all_labels = [all_labels]
        return all_texts, all_labels
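The docstring above describes how the labels are nested; purely illustrative values (not from any corpus) for both layouts:

# Single output (output_num == 1): one flat list, wrapped once at the end.
all_labels_single_output = [["pos", "neg", "pos"]]
# Two outputs (output_num == 2): one label list per output head.
all_labels_two_outputs = [["pos", "neg", "pos"], ["sports", "tech", "sports"]]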
Example No. 3
    def generate_data(self):
        """Generate data for offline training."""
        paths = self.paths
        if self.infer_without_label:
            self.column_num = 1
            text_ds = load_textline_dataset(paths, self.column_num)
        else:
            self.column_num = 2
            label_ds, text_ds = load_textline_dataset(paths, self.column_num)

        logging.info("process text ds...")
        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)
        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        logging.info("process label ds...")
        if self.infer_without_label:
            data_set = text_ds
        else:
            label_ds = process_multi_label_dataset(label_ds, self.config)
            data_set = tf.data.Dataset.zip((text_ds, label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths)

        return data_set
Example No. 4
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 1
            text_ds = load_textline_dataset(self.paths_after_pre_process,
                                            column_num)
        else:
            column_num = 2
            label_ds, text_ds = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)

        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        if self.use_dense:
            # Optional dense (numeric) features are loaded from a .npy file
            # and wrapped in their own dataset.
            dense = load_npy(self.dense_npy)
            dense_ds = load_dense_dataset(dense)

        if self.infer_without_label:
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds))
            else:
                data_set = text_ds
        else:
            label_ds = process_one_label_dataset(label_ds, self.config)
            if self.use_dense:
                data_set = tf.data.Dataset.zip((text_ds, dense_ds, label_ds))
            else:
                data_set = tf.data.Dataset.zip((text_ds, label_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        if self.split_token != "":
            if self.split_token not in vocab_dict:
                raise ValueError(
                    "The Model uses split token: {}, not in corpus.".format(
                        self.split_token))
            self.config['data']['split_token'] = int(
                vocab_dict[self.split_token])
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set
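Example No. 4 also resolves the configured split token against the text vocabulary. A minimal sketch of that lookup with a made-up vocab_dict and split token (the real values come from load_vocab_dict and the task config):

vocab_dict = {"<pad>": 0, "<unk>": 1, "[SEP]": 2}  # hypothetical vocabulary
split_token = "[SEP]"                              # hypothetical config value
if split_token != "":
    if split_token not in vocab_dict:
        raise ValueError(
            "The Model uses split token: {}, not in corpus.".format(split_token))
    split_id = int(vocab_dict[split_token])  # what gets stored in config['data']['split_token']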
Example No. 5
    def generate_data(self):
        """Generate data for offline training."""
        if self.infer_without_label:
            column_num = 2
            text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)
        else:
            column_num = 3
            label, text_ds_left, text_ds_right = load_textline_dataset(
                self.paths_after_pre_process, column_num)

        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds_left = text_ds_left.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_ds_right = text_ds_right.map(
            input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
        text_size_ds_left = text_ds_left.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_size_ds_right = text_ds_right.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds_left_right = tf.data.Dataset.zip((text_ds_left, text_ds_right))
        text_len_left_right = tf.data.Dataset.zip(
            (text_size_ds_left, text_size_ds_right))
        if self.infer_without_label:
            data_set_left_right = text_ds_left_right
        else:
            label_ds = process_one_label_dataset(label, self.config)
            data_set_left_right = tf.data.Dataset.zip(
                (text_ds_left_right, label_ds))
        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)

        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths_after_pre_process)

        return data_set_left_right, text_len_left_right
Example No. 6
    def generate_data(self):
        """Generate data for offline training."""

        column_num = 1
        src_path = self.src_paths_after_pre_process
        target_path = self.tgt_paths_after_pre_process

        # load_textline_dataset returns one dataset per column; keep the
        # single source-text column.
        src_ds = load_textline_dataset([src_path], column_num)
        src_ds = src_ds[0]

        input_pipeline_func = self.get_input_pipeline(for_export=False)

        src_ds = src_ds.map(input_pipeline_func,
                            num_parallel_calls=self.num_parallel_calls)

        src_size_ds = src_ds.map(
            lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
            num_parallel_calls=self.num_parallel_calls)

        src_ds = src_ds.map(self.exclude_padding,
                            num_parallel_calls=self.num_parallel_calls)

        if self.infer_without_label:
            data_set = tf.data.Dataset.zip((src_ds, src_size_ds))
        else:
            tgt = load_textline_dataset([target_path], column_num)
            tgt = tgt[0]
            # Teacher forcing: the decoder input is the target prefixed with
            # START_TOKEN; the decoder output is the target suffixed with END_TOKEN.
            tgt_out_ds = tgt.map(lambda x: x + ' ' + self.END_TOKEN)
            tgt_in_ds = tgt.map(lambda x: self.START_TOKEN + ' ' + x)

            tgt_in_ds = tgt_in_ds.map(
                lambda batch: self.text_pipeline_func(
                    batch, self.max_dec_len, self.text_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_size_ds = tgt_in_ds.map(
                lambda x: compute_sen_lens(x, padding_token=utils.PAD_IDX),
                num_parallel_calls=self.num_parallel_calls)

            tgt_in_ds = tgt_in_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)

            inp_ds = tf.data.Dataset.zip(
                (src_ds, src_size_ds, tgt_in_ds, tgt_in_size_ds))

            if self.use_label_vocab:
                target_vocab_file_path = self.label_vocab_file_paths[0]
            else:
                target_vocab_file_path = self.text_vocab_file_path
            tgt_out_ds = tgt_out_ds.map(
                lambda batch: self.text_pipeline_func(batch, self.max_dec_len,
                                                      target_vocab_file_path),
                num_parallel_calls=self.num_parallel_calls)

            tgt_out_ds = tgt_out_ds.map(
                self.exclude_padding,
                num_parallel_calls=self.num_parallel_calls)
            data_set = tf.data.Dataset.zip((inp_ds, tgt_out_ds))

        vocab_dict = load_vocab_dict(self.text_vocab_file_path)
        vocab_size = len(vocab_dict)
        label_vocab_dict = load_vocab_dict(self.label_vocab_file_paths[0])
        label_vocab_size = len(label_vocab_dict)
        data_size = get_file_len(self.src_paths_after_pre_process)
        self.config['data']['vocab_size'] = vocab_size
        self.config['data']['label_vocab_size'] = label_vocab_size
        self.config['data']['{}_data_size'.format(self.mode)] = data_size

        return data_set
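The target side in Example No. 6 is built twice: once with a start token prepended (the decoder input) and once with an end token appended (the decoder target). A self-contained sketch of that split, using tf.strings.join and hypothetical token strings in place of self.START_TOKEN / self.END_TOKEN:

import tensorflow as tf

START_TOKEN, END_TOKEN = "<s>", "</s>"  # hypothetical token strings
tgt = tf.data.Dataset.from_tensor_slices(["how are you", "fine thanks"])
# Decoder input: "<s> how are you"; decoder target: "how are you </s>".
tgt_in_ds = tgt.map(lambda x: tf.strings.join([START_TOKEN, x], separator=" "))
tgt_out_ds = tgt.map(lambda x: tf.strings.join([x, END_TOKEN], separator=" "))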
Example No. 7
    def test_get_file_name(self):
        paths = self.config["data"]["train"]["paths"]
        self.assertEqual(get_file_len(paths), 300)
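get_file_len itself is not shown in these examples; judging from how it is used above (the per-split example count), a plausible stand-in simply sums the line counts of the given files:

def get_file_len(paths):
    # Assumed behaviour: total number of lines across all files in `paths`.
    total = 0
    for path in paths:
        with open(path, "r", encoding="utf8") as fin:
            total += sum(1 for _ in fin)
    return total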