Example #1
# Shared dependencies for the snippets below: all of them use TF1-style
# graphs/placeholders and rely on helpers (tokenize_label, tokenize_sentence)
# defined elsewhere in the same project.
from absl import logging  # the project may use the stdlib logging module instead

import tensorflow as tf


def load_multi_label_dataset(label_placeholder, config, output_index=None):
  """Load multi-label data set."""
  logging.info("Loading multi label dataset...")
  label_vocab_file_path = config["data"]["task"]["label_vocab"]
  num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
  max_seq_len = config["data"]["task"]["max_seq_len"]

  if isinstance(label_vocab_file_path, list):
    if output_index is None or output_index not in range(
        len(label_vocab_file_path)):
      raise IndexError("output_index:{} not in the range of classes length: "
                       "{}!".format(output_index, len(label_vocab_file_path)))
    label_vocab_file_path = label_vocab_file_path[output_index]

  label_ds = tf.data.Dataset.from_tensor_slices(label_placeholder)
  label_ds = label_ds.map(
      lambda x: tokenize_label(
          x,
          maxlen=max_seq_len,
          label_vocab_file_path=label_vocab_file_path,
          pad_id=0),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)

  return label_ds
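
A minimal usage sketch for the multi-output case, assuming a TF1 graph; the config keys match what the function reads, but the vocab paths and sizes here are hypothetical:

config = {
    "data": {
        "task": {
            # One label vocab per output head; output_index selects one.
            "label_vocab": ["exp/label_vocab_0.txt", "exp/label_vocab_1.txt"],
            "num_parallel_calls": 4,
            "max_seq_len": 32,
        }
    }
}
label_placeholder = tf.compat.v1.placeholder(tf.string, shape=[None])
# Yields, per example, a sequence of max_seq_len label ids (padded with 0).
label_ds = load_multi_label_dataset(label_placeholder, config, output_index=0)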
Example #2
def load_one_label_dataset(label_placeholder, config, output_index=None):
  """Load one-label data set."""
  logging.info("Loading one label dataset...")
  num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
  classes = config["data"]["task"]["classes"]
  if isinstance(classes, list):
    if output_index is None or output_index not in range(len(classes)):
      raise IndexError("output_index:{} not in the range of classes length: "
                       "{}!".format(output_index, len(classes)))
    num_classes = classes[output_index]["num_classes"]
    label_vocab_file_path = config["data"]["task"]["label_vocab"][output_index]
  else:
    num_classes = classes["num_classes"]
    label_vocab_file_path = config["data"]["task"]["label_vocab"]
  label_ds = tf.data.Dataset.from_tensor_slices(label_placeholder)

  label_ds = label_ds.map(
      lambda x: tokenize_label(
          x, maxlen=1, label_vocab_file_path=label_vocab_file_path, pad_id=0),
      num_parallel_calls=num_parallel_calls)

  label_ds = label_ds.map(
      lambda l: tf.one_hot(l, num_classes, dtype=tf.int32),
      num_parallel_calls=num_parallel_calls)

  label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)

  return label_ds
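
The one-label loader follows the same pattern but one-hot encodes the single label id. A sketch with a non-list classes entry (paths and counts hypothetical):

config = {
    "data": {
        "task": {
            "label_vocab": "exp/label_vocab.txt",
            "classes": {"num_classes": 3},
            "num_parallel_calls": 4,
        }
    }
}
label_placeholder = tf.compat.v1.placeholder(tf.string, shape=[None])
# Yields, per example, a squeezed one-hot vector of length num_classes.
label_ds = load_one_label_dataset(label_placeholder, config)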
Example #3
  # Part of a tf.test.TestCase subclass (note self.cached_session below).
  def test_label_and_text(self):
    text = ["O O"]
    maxlen = 2
    text_tokenize_t = tokenize_sentence(text, maxlen, self.vocab_text_filepath)
    label = ["B B"]
    maxlen = 2
    label_tokenize_t = tokenize_label(label, maxlen, self.vocab_label_filepath,
                                      -1)

    with self.cached_session(use_gpu=False, force_gpu=False) as sess:
      res = sess.run([text_tokenize_t, label_tokenize_t])
      logging.debug(res)
      self.assertAllEqual(res[0], [[3, 3]])
      self.assertAllEqual(res[1], [[0, 0]])
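
The expected ids only hold for particular vocab files, which the source does not show; a plausible reconstruction of what the test assumes:

# vocab_text_filepath: maps token "O" to id 3 (e.g. a few special tokens
# such as <pad>/<s>/<unk> occupy ids 0-2).
# vocab_label_filepath: maps label "B" to id 0.
# pad_id=-1 is passed for labels, but both positions hold real labels,
# so no padding value appears in the expected output [[0, 0]].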
Example #4
def load_multi_label_dataset(label, config):
    """Load multi-label data set."""
    logging.info("Loading multi label dataset...")
    label_vocab_file_path = config["data"]["task"]["label_vocab"]
    num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
    max_seq_len = config["data"]["task"]["max_seq_len"]
    label_ds = tf.data.Dataset.from_tensor_slices(label)
    label_ds = label_ds.map(
        lambda x: tokenize_label(x,
                                 maxlen=max_seq_len,
                                 label_vocab_file_path=label_vocab_file_path,
                                 pad_id=0),
        num_parallel_calls=num_parallel_calls)
    label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)

    return label_ds
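
Unlike the one-label loader, this variant keeps a full sequence of label ids per example (for sequence labeling) and applies no one-hot encoding. A shape sketch with a hypothetical vocab:

# With max_seq_len=4 and a label vocab mapping B->0, I->1, O->2:
#   input:  "B I O O"
#   output: [0, 1, 2, 2]   # one id per token, shape (max_seq_len,)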
Example #5
def load_one_label_dataset(label, config):
    """Load one-label data set."""
    logging.info("Loading one label dataset...")
    label_vocab_file_path = config["data"]["task"]["label_vocab"]
    num_classes = config["data"]["task"]["classes"]["num_classes"]
    num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
    label_ds = tf.data.Dataset.from_tensor_slices(label)

    label_ds = label_ds.map(
        lambda x: tokenize_label(
            x, maxlen=1, label_vocab_file_path=label_vocab_file_path,
            pad_id=0),
        num_parallel_calls=num_parallel_calls)

    label_ds = label_ds.map(
        lambda l: tf.one_hot(l, num_classes, dtype=tf.int32),
        num_parallel_calls=num_parallel_calls)

    label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)

    return label_ds
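
Because every loader builds its dataset from a placeholder, the values are fed at session run time. A minimal end-to-end sketch in the same TF1 style, reusing the config sketch from Example #2 (iterator wiring and feed values are illustrative, not from the source):

label_placeholder = tf.compat.v1.placeholder(tf.string, shape=[None])
label_ds = load_one_label_dataset(label_placeholder, config)
iterator = tf.compat.v1.data.make_initializable_iterator(label_ds)
one_hot_label = iterator.get_next()

with tf.compat.v1.Session() as sess:
    sess.run(iterator.initializer,
             feed_dict={label_placeholder: ["positive", "negative"]})
    print(sess.run(one_hot_label))  # one-hot vector for the first label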