def _build(self, features, labels):
    """Converts source/target tokens to vocabulary ids and truncates
    sequences to the configured maximum lengths.

    Args:
      features: Dict of tensors with "source_tokens" and "source_len".
      labels: Dict of tensors with "target_tokens" and "target_len",
        or None at inference time.

    Returns:
      A `(features, labels)` tuple with "source_ids" / "target_ids"
      added; `labels` is None when no labels were passed in.
    """
    # Build the vocabulary lookup tables for both sides.
    source_vocab_to_id, source_id_to_vocab, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)
    target_vocab_to_id, target_id_to_vocab, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Expose the tables through a graph collection so other parts of the
    # graph can fetch them later.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab
    }, "vocab_tables")

    # Truncate overly long source sequences.
    max_source = self.max_seq_len_source
    if max_source is not None:
        features["source_tokens"] = features["source_tokens"][:, :max_source]
        features["source_len"] = tf.minimum(
            features["source_len"], max_source)

    # Map source tokens to vocabulary ids.
    features["source_ids"] = source_vocab_to_id.lookup(
        features["source_tokens"])
    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate overly long target sequences.
    max_target = self.max_seq_len_target
    if max_target is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_target]
        labels["target_len"] = tf.minimum(labels["target_len"], max_target)

    # Map target tokens to vocabulary ids.
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    return features, labels
def test_with_counts(self):
    """Lookup tables built from a vocabulary file that includes counts."""
    vocab_list = ["Hello", ".", "笑"]
    vocab_counts = [100, 200, 300]
    vocab_file = test_utils.create_temporary_vocab_file(vocab_list,
                                                       vocab_counts)

    vocab_to_id_table, id_to_vocab_table, word_to_count_table, vocab_size = \
        vocab.create_vocabulary_lookup_table(vocab_file.name)

    self.assertEqual(vocab_size, 6)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # Known words map to their ids; OOV words share the UNK id.
        ids_op = vocab_to_id_table.lookup(
            tf.convert_to_tensor(["Hello", ".", "笑", "??", "xxx"]))
        np.testing.assert_array_equal(sess.run(ids_op), [0, 1, 2, 3, 3])

        # Ids map back to words; the OOV id maps to "UNK".
        words_op = id_to_vocab_table.lookup(
            tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64))
        words = sess.run(words_op)
        np.testing.assert_array_equal(
            np.char.decode(words.astype("S"), "utf-8"),
            ["Hello", ".", "笑", "UNK"])

        # Known words report their counts; OOV words report -1.
        counts_op = word_to_count_table.lookup(
            tf.convert_to_tensor(["Hello", ".", "笑", "??", "xxx"]))
        np.testing.assert_array_equal(
            sess.run(counts_op), [100, 200, 300, -1, -1])
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates the vocabulary lookup tables for the target vocab
    - Converts target tokens into vocabulary ids

    NOTE(review): the original docstring also said SEQUENCE_START /
    SEQUENCE_END tokens are prepended/appended here; this method itself
    does not do that — presumably it happens elsewhere in the input
    pipeline. Confirm before relying on it.
    """
    # Build the target-side vocabulary lookup tables.
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Publish the tables in a graph collection so other graph components
    # can retrieve them.
    graph_utils.add_dict_to_collection(
        {
            "target_vocab_to_id": target_vocab_to_id,
            "target_id_to_vocab": target_id_to_vocab,
            "target_word_to_count": target_word_to_count
        }, "vocab_tables")

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate targets that exceed the configured maximum length.
    max_len = self.params["target.max_seq_len"]
    if max_len is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_len]
        labels["target_len"] = tf.minimum(labels["target_len"], max_len)

    # Convert target tokens to vocabulary ids.
    labels["target_ids"] = target_vocab_to_id.lookup(
        labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Make features/labels retrievable from graph collections later.
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates the vocabulary lookup tables for the target vocab
    - Converts target tokens into vocabulary ids

    NOTE(review): the original docstring also claimed SEQUENCE_START /
    SEQUENCE_END handling; no such logic exists in this method —
    presumably performed elsewhere in the input pipeline.
    """
    # Target-side vocabulary lookup tables.
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Store the tables in a graph collection for access elsewhere.
    graph_utils.add_dict_to_collection({
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    # Nothing further to do at inference time.
    if labels is None:
        return features, None

    labels = labels.copy()

    # Clip target sequences to the configured maximum length.
    target_max = self.params["target.max_seq_len"]
    if target_max is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :target_max]
        labels["target_len"] = tf.minimum(labels["target_len"], target_max)

    # Token -> id lookup for the targets.
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Expose features/labels via graph collections for later use.
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def test_lookup_table(self):
    """Lookup tables built from a plain vocabulary file (no counts)."""
    vocab_to_id_table, id_to_vocab_table, vocab_size = \
        vocab.create_vocabulary_lookup_table(self.vocab_file.name)

    self.assertEqual(vocab_size, 6)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())
        sess.run(tf.tables_initializer())

        # Known words map to their ids; OOV words share the UNK id.
        ids_op = vocab_to_id_table.lookup(
            tf.convert_to_tensor(["Hello", ".", "笑", "??", "xxx"]))
        np.testing.assert_array_equal(sess.run(ids_op), [0, 1, 2, 3, 3])

        # Ids map back to words; the OOV id maps to "UNK".
        words_op = id_to_vocab_table.lookup(
            tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64))
        words = sess.run(words_op)
        np.testing.assert_array_equal(
            np.char.decode(words.astype("S"), "utf-8"),
            ["Hello", ".", "笑", "UNK"])
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    - Optionally reverses the source sequence
    - Accumulates the total number of processed tokens in a counter

    Args:
      features: Dict of tensors with "source_tokens" and "source_len".
      labels: Dict of tensors with "target_tokens" and "target_len",
        or None at inference time.

    Returns:
      The `(features, labels)` tuple with "source_ids"/"target_ids"
      added; `labels` is None when no labels were passed in.
    """
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection(
        {
            "source_vocab_to_id": source_vocab_to_id,
            "source_id_to_vocab": source_id_to_vocab,
            "source_word_to_count": source_word_to_count,
            "target_vocab_to_id": target_vocab_to_id,
            "target_id_to_vocab": target_id_to_vocab,
            "target_word_to_count": target_word_to_count
        }, "vocab_tables")

    # Slice source to max_len
    if self.params["source.max_seq_len"] is not None:
        features["source_tokens"] = features[
            "source_tokens"][:, :self.params["source.max_seq_len"]]
        features["source_len"] = tf.minimum(
            features["source_len"], self.params["source.max_seq_len"])

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(
        features["source_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
        labels["target_tokens"] = labels[
            "target_tokens"][:, :self.params["target.max_seq_len"]]
        labels["target_len"] = tf.minimum(
            labels["target_len"], self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(
        labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    # BUG FIX: the original `tf.Variable(0, "tokens_counter")` passed the
    # name string as the second positional argument, which is `trainable`.
    # The truthy string accidentally made the counter trainable and left
    # it unnamed. Pass `name` explicitly and keep the counter out of the
    # trainable-variables collection.
    token_counter_var = tf.Variable(0, trainable=False,
                                    name="tokens_counter")
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    # Tie the counter update to the consumption of source_tokens so it
    # actually runs each step.
    with tf.control_dependencies([total_tokens]):
        features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    - Optionally reverses the source sequence

    NOTE(review): the original docstring also said SEQUENCE_START /
    SEQUENCE_END tokens are added here; this method itself does not do
    that — presumably it happens elsewhere in the input pipeline.
    """
    # Build lookup tables for both vocabularies.
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Publish the tables in a graph collection so other graph components
    # can retrieve them.
    graph_utils.add_dict_to_collection(
        {
            "source_vocab_to_id": source_vocab_to_id,
            "source_id_to_vocab": source_id_to_vocab,
            "source_word_to_count": source_word_to_count,
            "target_vocab_to_id": target_vocab_to_id,
            "target_id_to_vocab": target_id_to_vocab,
            "target_word_to_count": target_word_to_count
        }, "vocab_tables")

    # Truncate sources that exceed the configured maximum length.
    max_source_len = self.params["source.max_seq_len"]
    if max_source_len is not None:
        features["source_tokens"] = features["source_tokens"][
            :, :max_source_len]
        features["source_len"] = tf.minimum(
            features["source_len"], max_source_len)

    # Convert source tokens to vocabulary ids.
    features["source_ids"] = source_vocab_to_id.lookup(
        features["source_tokens"])

    # Optionally feed the source in reverse order.
    if self.params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate targets that exceed the configured maximum length.
    max_target_len = self.params["target.max_seq_len"]
    if max_target_len is not None:
        labels["target_tokens"] = labels["target_tokens"][
            :, :max_target_len]
        labels["target_len"] = tf.minimum(
            labels["target_len"], max_target_len)

    # Convert target tokens to vocabulary ids.
    labels["target_ids"] = target_vocab_to_id.lookup(
        labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Make features/labels retrievable from graph collections later.
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    - Optionally reverses the source sequence
    - Accumulates the total number of processed tokens in a counter

    Args:
      features: Dict of tensors with "source_tokens" and "source_len".
      labels: Dict of tensors with "target_tokens" and "target_len",
        or None at inference time.

    Returns:
      The `(features, labels)` tuple with "source_ids"/"target_ids"
      added; `labels` is None when no labels were passed in.
    """
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "source_word_to_count": source_word_to_count,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    # Slice source to max_len
    if self.params["source.max_seq_len"] is not None:
        features["source_tokens"] = features["source_tokens"][:, :self.params[
            "source.max_seq_len"]]
        features["source_len"] = tf.minimum(features["source_len"],
                                            self.params["source.max_seq_len"])

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(features[
        "source_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :self.params[
            "target.max_seq_len"]]
        labels["target_len"] = tf.minimum(labels["target_len"],
                                          self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    # BUG FIX: the original `tf.Variable(0, "tokens_counter")` passed the
    # name string as the second positional argument, which is `trainable`.
    # The truthy string accidentally made the counter trainable and left
    # it unnamed. Pass `name` explicitly and keep the counter out of the
    # trainable-variables collection.
    token_counter_var = tf.Variable(0, trainable=False,
                                    name="tokens_counter")
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    # Tie the counter update to the consumption of source_tokens so it
    # actually runs each step.
    with tf.control_dependencies([total_tokens]):
        features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def t(file_path, default_value=None):
    """Create vocabulary lookup tables for the vocab file at `file_path`.

    Args:
      file_path: Path to the vocabulary file.
      default_value: Forwarded to `create_vocabulary_lookup_table`.
        BUG FIX: the original accepted this parameter but silently
        ignored it.

    Returns:
      Whatever `create_vocabulary_lookup_table` returns (the lookup
      tables and the vocabulary size).
    """
    tables = create_vocabulary_lookup_table(file_path,
                                            default_value=default_value)
    # NOTE(review): debug print left in place to preserve behavior;
    # consider replacing with proper logging.
    print(tables)
    return tables