def load_multi_label_dataset(label_placeholder, config, output_index=None):
  """Load multi-label data set."""
  logging.info("Loading multi label dataset...")
  label_vocab_file_path = config["data"]["task"]["label_vocab"]
  num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
  max_seq_len = config["data"]["task"]["max_seq_len"]

  # With multiple output heads, "label_vocab" is a list; pick the vocab for
  # the requested head after validating the index.
  if isinstance(label_vocab_file_path, list):
    if output_index is None or output_index not in range(
        len(label_vocab_file_path)):
      raise IndexError("output_index:{} not in the range of classes length: "
                       "{}!".format(output_index, len(label_vocab_file_path)))
    label_vocab_file_path = label_vocab_file_path[output_index]

  label_ds = tf.data.Dataset.from_tensor_slices(label_placeholder)
  # Map each raw label string to a padded id sequence of length max_seq_len.
  label_ds = label_ds.map(
      lambda x: tokenize_label(
          x,
          maxlen=max_seq_len,
          label_vocab_file_path=label_vocab_file_path,
          pad_id=0),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)
  return label_ds
def load_one_label_dataset(label_placeholder, config, output_index=None):
  """Load one-label data set."""
  logging.info("Loading one label dataset...")
  num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
  classes = config["data"]["task"]["classes"]

  # With multiple output heads, "classes" and "label_vocab" are lists; pick
  # the entries for the requested head after validating the index.
  if isinstance(classes, list):
    if output_index is None or output_index not in range(len(classes)):
      raise IndexError("output_index:{} not in the range of classes length: "
                       "{}!".format(output_index, len(classes)))
    num_classes = classes[output_index]["num_classes"]
    label_vocab_file_path = config["data"]["task"]["label_vocab"][output_index]
  else:
    num_classes = classes["num_classes"]
    label_vocab_file_path = config["data"]["task"]["label_vocab"]

  label_ds = tf.data.Dataset.from_tensor_slices(label_placeholder)
  # One label id per example, then one-hot encode and squeeze the extra dim.
  label_ds = label_ds.map(
      lambda x: tokenize_label(
          x, maxlen=1, label_vocab_file_path=label_vocab_file_path, pad_id=0),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(
      lambda l: tf.one_hot(l, num_classes, dtype=tf.int32),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)
  return label_ds
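# Usage sketch for the multi-output loaders above (hypothetical values): the
# vocab paths, class counts, and label strings are illustrative only, and the
# returned tf.data.Dataset still has to be consumed by a session or iterator
# as elsewhere in this codebase.
def _example_multi_output_usage():
  config = {
      "data": {
          "task": {
              "label_vocab": ["exp/label_vocab_0.txt", "exp/label_vocab_1.txt"],
              "classes": [{"num_classes": 3}, {"num_classes": 5}],
              "num_parallel_calls": 4,
              "max_seq_len": 30,
          }
      }
  }
  # Pick a specific output head; a missing or out-of-range output_index
  # raises IndexError in both loaders.
  one_label_ds = load_one_label_dataset(["positive"], config, output_index=1)
  multi_label_ds = load_multi_label_dataset(["B I O O"], config, output_index=0)
  return one_label_ds, multi_label_ds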
def test_label_and_text(self):
  text = ["O O"]
  maxlen = 2
  text_tokenize_t = tokenize_sentence(text, maxlen, self.vocab_text_filepath)

  label = ["B B"]
  maxlen = 2
  label_tokenize_t = tokenize_label(label, maxlen, self.vocab_label_filepath, -1)

  with self.cached_session(use_gpu=False, force_gpu=False) as sess:
    res = sess.run([text_tokenize_t, label_tokenize_t])
    logging.debug(res)
    self.assertAllEqual(res[0], [[3, 3]])
    self.assertAllEqual(res[1], [[0, 0]])
def load_multi_label_dataset(label, config):
  """Load multi-label data set."""
  logging.info("Loading multi label dataset...")
  label_vocab_file_path = config["data"]["task"]["label_vocab"]
  num_parallel_calls = config["data"]["task"]["num_parallel_calls"]
  max_seq_len = config["data"]["task"]["max_seq_len"]

  label_ds = tf.data.Dataset.from_tensor_slices(label)
  label_ds = label_ds.map(
      lambda x: tokenize_label(
          x,
          maxlen=max_seq_len,
          label_vocab_file_path=label_vocab_file_path,
          pad_id=0),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)
  return label_ds
def load_one_label_dataset(label, config):
  """Load one-label data set."""
  logging.info("Loading one label dataset...")
  label_vocab_file_path = config["data"]["task"]["label_vocab"]
  num_classes = config["data"]["task"]["classes"]["num_classes"]
  num_parallel_calls = config["data"]["task"]["num_parallel_calls"]

  label_ds = tf.data.Dataset.from_tensor_slices(label)
  label_ds = label_ds.map(
      lambda x: tokenize_label(
          x, maxlen=1, label_vocab_file_path=label_vocab_file_path, pad_id=0),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(
      lambda l: tf.one_hot(l, num_classes, dtype=tf.int32),
      num_parallel_calls=num_parallel_calls)
  label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)
  return label_ds
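# Usage sketch for the single-output loaders above (hypothetical values): here
# "label_vocab" is a single path and "classes" a single dict, matching the
# simpler signatures that take no output_index. The path and class count are
# assumptions made for illustration, not values from the original code.
def _example_single_output_usage():
  config = {
      "data": {
          "task": {
              "label_vocab": "exp/label_vocab.txt",
              "classes": {"num_classes": 2},
              "num_parallel_calls": 4,
              "max_seq_len": 30,
          }
      }
  }
  one_label_ds = load_one_label_dataset(["positive"], config)
  multi_label_ds = load_multi_label_dataset(["B I O O"], config)
  return one_label_ds, multi_label_ds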