Example #1
def generate_data(self):
    """Generate data for offline training."""
    # Inference input has only a text column; training files also carry
    # intent and slot label columns.
    if self.infer_without_label:
      column_num = 1
      text_ds = load_textline_dataset(self.paths_after_pre_process, column_num)
    else:
      column_num = 3
      intent_label_ds, slots_label_ds, text_ds = load_textline_dataset(
          self.paths_after_pre_process, column_num)

    logging.info("Loading text dataset...")
    input_pipeline_func = self.get_input_pipeline(for_export=False)
    text_ds = text_ds.map(
        input_pipeline_func, num_parallel_calls=self.num_parallel_calls)
    text_size_ds = text_ds.map(
        lambda x: compute_sen_lens(x, padding_token=0),
        num_parallel_calls=self.num_parallel_calls)
    text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

    if self.infer_without_label:
      data_set = text_ds
    else:
      # One intent label per utterance; one slot label per token.
      intent_label_ds = process_one_label_dataset(
          intent_label_ds, self.config, output_index=0)
      slots_label_ds = process_multi_label_dataset(
          slots_label_ds, self.config, output_index=1)
      data_set = tf.data.Dataset.zip((text_ds, intent_label_ds, slots_label_ds))

    self.config['data']['vocab_size'] = get_vocab_size(
        self.text_vocab_file_path)
    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
        self.paths_after_pre_process)

    return data_set
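This example relies on the helper compute_sen_lens, which is not shown in the listing. A minimal sketch of what it plausibly does, assuming token ids are padded with padding_token (an illustration, not necessarily the project's actual implementation):

    import tensorflow as tf

    def compute_sen_lens(sentence, padding_token=0):
      # Count the non-padding positions along the last axis to recover
      # each sentence's true length after padding.
      return tf.reduce_sum(
          tf.cast(tf.not_equal(sentence, padding_token), tf.int32), axis=-1)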
Example #2
    def generate_data(self):
        """Generate data for offline training."""
        text, (intent_label, slots_label) = load_nlu_joint_raw_data(
            paths=self.paths_after_pre_process, mode=self.mode)

        # Placeholders keep the raw arrays out of the serialized graph;
        # they are fed once, when the dataset iterator is initialized.
        text_placeholder = tf.placeholder(tf.string, name="text")
        intent_label_placeholder = tf.placeholder(tf.string,
                                                  name="intent_label")
        slots_label_placeholder = tf.placeholder(tf.string, name="slots_label")
        self.init_feed_dict[text_placeholder] = text
        self.init_feed_dict[intent_label_placeholder] = intent_label
        self.init_feed_dict[slots_label_placeholder] = slots_label

        text_ds = self.load_text_dataset(text_placeholder)

        if self.infer_without_label:
            data_set = text_ds
        else:
            intent_label_ds = load_one_label_dataset(intent_label,
                                                     self.config,
                                                     output_index=0)
            slots_label_ds = load_multi_label_dataset(slots_label,
                                                      self.config,
                                                      output_index=1)
            data_set = tf.data.Dataset.zip(
                (text_ds, intent_label_ds, slots_label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = len(text)

        return data_set
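Unlike Example #1, this variant loads the raw arrays into Python and routes them through tf.placeholder plus init_feed_dict, the usual TF1 idiom for keeping large datasets out of the serialized graph. A hedged sketch of how a caller would typically consume that feed dict (the `task` name and iterator wiring are illustrative assumptions, not from the source):

    import tensorflow as tf

    data_set = task.generate_data()
    iterator = data_set.make_initializable_iterator()
    next_batch = iterator.get_next()

    with tf.Session() as sess:
      # Feed the raw text/label arrays into the placeholders once,
      # when the iterator is initialized.
      sess.run(iterator.initializer, feed_dict=task.init_feed_dict)
      batch = sess.run(next_batch)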
Example #3
    def generate_data(self):
        """Generate data for offline training."""
        paths = self.paths
        if self.infer_without_label:
            self.column_num = 1
            text_ds = load_textline_dataset(paths, self.column_num)
        else:
            self.column_num = 2
            label_ds, text_ds = load_textline_dataset(paths, self.column_num)

        logging.info("process text ds...")
        input_pipeline_func = self.get_input_pipeline(for_export=False)
        text_ds = text_ds.map(input_pipeline_func,
                              num_parallel_calls=self.num_parallel_calls)
        text_size_ds = text_ds.map(
            lambda x: compute_sen_lens(x, padding_token=0),
            num_parallel_calls=self.num_parallel_calls)
        text_ds = tf.data.Dataset.zip((text_ds, text_size_ds))

        logging.info("process label ds...")
        if self.infer_without_label:
            data_set = text_ds
        else:
            label_ds = process_multi_label_dataset(label_ds, self.config)
            data_set = tf.data.Dataset.zip((text_ds, label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
            self.paths)

        return data_set
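get_file_len is another external helper; a plausible sketch, assuming it simply counts lines across the given files (illustrative only):

    def get_file_len(paths):
      # Total number of lines (i.e. examples) across all input files.
      total = 0
      for path in paths:
        with open(path, encoding="utf-8") as f:
          total += sum(1 for _ in f)
      return total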
Example #4
    def generate_data(self):
        """Generate data for offline training."""
        text, label = load_seq_label_raw_data(
            paths=self.paths,
            mode=self.mode,
            infer_no_label=self.infer_no_label)

        text_placeholder = tf.placeholder(tf.string, name="text")
        label_placeholder = tf.placeholder(tf.string, name="label")
        self.init_feed_dict[text_placeholder] = text
        self.init_feed_dict[label_placeholder] = label

        text_ds = self.load_text_dataset(text_placeholder)

        if self.infer_without_label:
            data_set = text_ds
        else:
            label_ds = load_multi_label_dataset(label_placeholder, self.config)
            data_set = tf.data.Dataset.zip((text_ds, label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = len(text)

        return data_set
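get_vocab_size, used by every variant, is also not shown. A minimal sketch under the assumption that the vocab file stores one entry per line (an illustration, not the project's code):

    def get_vocab_size(vocab_file_path):
      # One vocabulary entry per non-empty line.
      with open(vocab_file_path, encoding="utf-8") as f:
        return sum(1 for line in f if line.strip())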
Example #5
    def generate_data(self):
        """Generate data for offline training."""
        text, label = load_seq_label_raw_data(
            paths=self.paths,
            mode=self.mode,
            infer_no_label=self.infer_no_label)
        text_ds = self.load_text_dataset(text)

        if self.infer_without_label:
            data_set = text_ds
        else:
            label_ds = load_multi_label_dataset(label, self.config)
            data_set = tf.data.Dataset.zip((text_ds, label_ds))

        self.config['data']['vocab_size'] = get_vocab_size(
            self.text_vocab_file_path)
        self.config['data']['{}_data_size'.format(self.mode)] = len(text)

        return data_set
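Every variant returns an unbatched tf.data.Dataset, leaving batching to the caller. A typical downstream step (the batch size, shuffle buffer, and `task` name are illustrative assumptions):

    data_set = task.generate_data()
    # Shuffle and batch for training. If the input pipeline does not pad
    # token ids to a fixed length, padded_batch would be needed instead.
    data_set = data_set.shuffle(buffer_size=10000).batch(32).prefetch(1)

The listing closes with a companion export_inputs method, used when exporting the trained model.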
  def export_inputs(self):
    """Inputs for exported model."""
    self.config['data']['vocab_size'] = get_vocab_size(
        self.text_vocab_file_path)
    input_sentence = tf.placeholder(
        shape=(None,), dtype=tf.string, name="input_sentence")

    input_pipeline_func = self.get_input_pipeline(for_export=True)
    # Tokenize each incoming sentence and compute its unpadded length.
    token_ids = input_pipeline_func(input_sentence)
    token_ids_len = tf.map_fn(lambda x: compute_sen_lens(x, padding_token=0),
                              token_ids)

    export_data = {
        "export_inputs": {
            "input_sentence": input_sentence
        },
        "model_inputs": {
            "input_x": token_ids,
            "input_x_len": token_ids_len
        }
    }

    return export_data
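export_data separates the user-facing string input from the tensors the model consumes. A hedged sketch of how such a dict is commonly wired into a TF1 SavedModel signature (`logits` stands in for whatever output tensor the model actually produces):

    import tensorflow as tf

    inputs = {
        name: tf.saved_model.utils.build_tensor_info(tensor)
        for name, tensor in export_data["export_inputs"].items()
    }
    outputs = {"scores": tf.saved_model.utils.build_tensor_info(logits)}
    signature = tf.saved_model.signature_def_utils.build_signature_def(
        inputs=inputs,
        outputs=outputs,
        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)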