def testCaptureHashTable(self):
  """Checks that a dataset `map` can capture a `HashTable` lookup.

  Builds a three-entry string -> int64 table, splits two sentences into
  words inside a dataset `map`, and maps `table.lookup` over the result. The
  dataset is expected to yield exactly two elements (one per sentence)
  before raising `OutOfRangeError`.
  """
  # NOTE(mrry): We must use the V2 variants of `HashTable`
  # etc. because these produce a `tf.resource`-typed output that is
  # compatible with the in-graph function implementation.
  vocab = constant_op.constant(["brain", "salad", "surgery"])
  vocab_ids = constant_op.constant([0, 1, 2], dtypes.int64)
  initializer = lookup_ops.KeyValueTensorInitializer(vocab, vocab_ids)
  missing_id = -1
  table = lookup_ops.HashTable(initializer, missing_id)

  def split_into_words(sentence):
    # `string_split` returns a sparse result; only its values are needed.
    return string_ops.string_split([sentence]).values

  sentences = dataset_ops.Dataset.from_tensor_slices(
      ["brain brain tank salad surgery", "surgery brain"])
  dataset = sentences.map(split_into_words).map(table.lookup)

  get_next = self.getNext(dataset, requires_initialization=True)

  # The table must be initialized before any lookup is evaluated.
  self.evaluate(table.initializer)

  # One element per input sentence.
  for _ in range(2):
    self.evaluate(get_next())

  # The dataset is now exhausted.
  with self.assertRaises(errors.OutOfRangeError):
    self.evaluate(get_next())
def collecting_function(x):
  """Creates an empty `HashTable` named "t1" and returns `x` unchanged.

  The table object itself is discarded; it is created only for its side
  effect.
  """
  empty_initializer = lookup_ops.KeyValueTensorInitializer([], [])
  lookup_ops.HashTable(empty_initializer, 0.0, name="t1")
  return x
def skip_gram_sample_with_text_vocab(input_tensor,
                                     vocab_freq_file,
                                     vocab_token_index=0,
                                     vocab_token_dtype=tf.dtypes.string,
                                     vocab_freq_index=1,
                                     vocab_freq_dtype=tf.dtypes.float64,
                                     vocab_delimiter=",",
                                     vocab_min_count=0,
                                     vocab_subsampling=None,
                                     corpus_size=None,
                                     min_skips=1,
                                     max_skips=5,
                                     start=0,
                                     limit=-1,
                                     emit_self_as_target=False,
                                     batch_size=None,
                                     batch_capacity=None,
                                     seed=None,
                                     name=None):
  """Generates skip-gram (token, label) pairs using a text vocabulary file.

  This is a convenience wrapper around `skip_gram_sample()`. It scans the
  vocabulary file once in Python to validate it and to count rows and total
  frequency, builds a `HashTable` from the file, and forwards everything to
  `skip_gram_sample()`.

  The vocabulary file is plain text with `vocab_delimiter`-separated columns.
  Column `vocab_token_index` holds the token and column `vocab_freq_index`
  holds how many times that token occurs in the corpus. For example, given:

  ```
  bonjour,fr,42
  hello,en,777
  hola,es,99
  ```

  use `vocab_delimiter=","`, `vocab_token_index=0` and `vocab_freq_index=2`.

  See the `skip_gram_sample()` documentation for details of the sampling
  process itself.

  Args:
    input_tensor: A rank-1 `Tensor` from which skip-gram candidates are
      generated.
    vocab_freq_file: `string` full path to the text vocabulary file.
    vocab_token_index: `int` column index of the tokens in the vocab file.
    vocab_token_dtype: `DType` of the tokens in the vocab file.
    vocab_freq_index: `int` column index of the frequency counts in the vocab
      file.
    vocab_freq_dtype: `DType` of the frequency counts in the vocab file.
    vocab_delimiter: `string` column delimiter used in the vocab file.
    vocab_min_count: `int`, `float`, or scalar `Tensor` giving the minimum
      frequency (from `vocab_freq_file`) a token needs in order to be kept in
      `input_tensor`. Should correspond with `vocab_freq_dtype`.
    vocab_subsampling: (Optional) `float` frequency-proportion threshold for
      tokens from `input_tensor`. More frequent tokens are randomly
      down-sampled. Reasonable starting values may be around 1e-3 or 1e-5.
      See Eq. 5 in http://arxiv.org/abs/1310.4546 for more details.
    corpus_size: (Optional) `int`, `float`, or scalar `Tensor` giving the
      total number of tokens in the corpus (e.g. the sum of all frequency
      counts in `vocab_freq_file`). Used together with `vocab_subsampling`
      for down-sampling frequent tokens. When it is needed but not supplied
      it is computed from `vocab_freq_file`. Supplying it yourself is useful
      when infrequent tokens (frequency < vocab_min_count) have already been
      removed from the vocab file to save memory in the lookup table. A
      user-supplied value must be greater than or equal to the sum of all
      frequency counts in `vocab_freq_file`.
    min_skips: `int` or scalar `Tensor`, minimum window size to randomly use
      for each token. Must be >= 0 and <= `max_skips`. If `min_skips` and
      `max_skips` are both 0, the only label outputted is the token itself.
    max_skips: `int` or scalar `Tensor`, maximum window size to randomly use
      for each token. Must be >= 0.
    start: `int` or scalar `Tensor`, position in `input_tensor` at which to
      start generating skip-gram candidates.
    limit: `int` or scalar `Tensor`, maximum number of elements of
      `input_tensor` to use. -1 means use the rest of the `Tensor` after
      `start`.
    emit_self_as_target: `bool` or scalar `Tensor`, whether each token is
      also emitted as a label for itself.
    batch_size: (Optional) `int` batch size of the returned `Tensors`.
    batch_capacity: (Optional) `int` capacity of the queue used for batching
      the returned `Tensors`. Only has an effect if `batch_size` > 0.
      Defaults to 100 * `batch_size` if not specified.
    seed: (Optional) `int` used to create a random seed for window size and
      subsampling. See `set_random_seed` for behavior.
    name: (Optional) A `string` name or a name scope for the operations.

  Returns:
    A `tuple` of (token, label) `Tensors`. Each is rank-1 with the same type
    as `input_tensor`. They have length `batch_size`; when `batch_size` is
    not given their length is random, but the two stay in sync as long as
    they are evaluated together.

  Raises:
    ValueError: If `vocab_token_index` or `vocab_freq_index` is negative or
      out of bounds for a row of `vocab_freq_file`; if both indices name the
      same column; if any row has a negative frequency; or if a supplied
      `corpus_size` is smaller than the summed frequencies of the file.
  """
  # Validate the column indices before touching the file.
  if any(index < 0 for index in (vocab_token_index, vocab_freq_index)):
    raise ValueError(
        "vocab_token_index={} and vocab_freq_index={} must both be >= 0."
        .format(vocab_token_index, vocab_freq_index))
  if vocab_freq_index == vocab_token_index:
    raise ValueError(
        "vocab_token_index and vocab_freq_index should be different, but are "
        "both {}.".format(vocab_token_index))

  # A row is usable only if it has a column at the larger of the two indices.
  highest_index = max(vocab_token_index, vocab_freq_index)

  # Single pass over the vocab file: count the rows (the table's vocab size)
  # and sum the frequency column (the corpus size implied by the file).
  summed_freq = 0.0
  num_rows = 0
  with tf.io.gfile.GFile(vocab_freq_file, mode="r") as vocab_file:
    for columns in csv.reader(vocab_file, delimiter=vocab_delimiter):
      num_columns = len(columns)
      if highest_index >= num_columns:
        raise ValueError(
            "Row in vocab file only has {} columns, so vocab_token_index={} or "
            "vocab_freq_index={} is out of bounds. Row content: {}".format(
                num_columns, vocab_token_index, vocab_freq_index, columns))
      num_rows += 1

      count = vocab_freq_dtype.as_numpy_dtype(columns[vocab_freq_index])
      if count < 0:
        raise ValueError(
            "Row in vocab file has negative frequency of {}. Row content: {}"
            .format(count, columns))
      # Note: tokens whose frequencies are below vocab_min_count will still
      # contribute to the total corpus size used for vocab subsampling.
      summed_freq += count

  if corpus_size:
    # A caller-supplied size may exceed the file's total but never undercut
    # it (beyond a small floating-point tolerance).
    if summed_freq - corpus_size > 1e-6:
      raise ValueError(
          "`corpus_size`={} must be greater than or equal to the sum of all the "
          "frequency counts ({}) of `vocab_freq_file` ({}).".format(
              corpus_size, summed_freq, vocab_freq_file))
  else:
    corpus_size = summed_freq

  table_initializer = lookup_ops.TextFileInitializer(
      filename=vocab_freq_file,
      key_dtype=vocab_token_dtype,
      key_index=vocab_token_index,
      value_dtype=vocab_freq_dtype,
      value_index=vocab_freq_index,
      vocab_size=num_rows,
      delimiter=vocab_delimiter)
  # For vocab terms not in vocab file, use a default value of -1.
  vocab_freq_table = lookup_ops.HashTable(table_initializer, default_value=-1)

  # corpus_size is not used unless vocab_subsampling is specified.
  effective_corpus_size = (
      corpus_size if vocab_subsampling is not None else None)

  return skip_gram_sample(
      input_tensor,
      min_skips=min_skips,
      max_skips=max_skips,
      start=start,
      limit=limit,
      emit_self_as_target=emit_self_as_target,
      vocab_freq_table=vocab_freq_table,
      vocab_min_count=vocab_min_count,
      vocab_subsampling=vocab_subsampling,
      corpus_size=effective_corpus_size,
      batch_size=batch_size,
      batch_capacity=batch_capacity,
      seed=seed,
      name=name)
def build_dataset(file_pattern,
                  input_config,
                  batch_size,
                  include_labels=True,
                  reverse_time_series_prob=0,
                  shuffle_filenames=False,
                  shuffle_values_buffer=0,
                  repeat=1,
                  use_tpu=False):
  """Creates a `tf.data` input pipeline over sharded TFRecord files.

  Each record is parsed as a `tf.Example` into a dict that may contain:
    * "time_series_features": dict of features flagged `is_time_series`.
    * "aux_features": dict of all other non-label features.
    * "labels": integer label id (only when `include_labels` is true).

  Args:
    file_pattern: File pattern matching input TFRecord files, e.g.
      "/tmp/train-?????-of-00100". May also be a comma-separated list of file
      patterns.
    input_config: ConfigDict containing feature and label specifications.
    batch_size: The number of examples per batch.
    include_labels: Whether to read labels from the input files.
    reverse_time_series_prob: If > 0, the time series features will be
      randomly reversed with this probability. Within a given example, either
      all time series features are reversed or none are.
    shuffle_filenames: Whether to shuffle the order of TFRecord files between
      epochs.
    shuffle_values_buffer: If > 0, shuffle examples using a buffer of this
      size.
    repeat: The number of times to repeat the dataset. If None or -1 the
      dataset repeats indefinitely.
    use_tpu: Whether to build the dataset for TPU.

  Raises:
    ValueError: If an input file pattern does not match any files, or if the
      label IDs in input_config.label_map are not contiguous integers
      starting at 0.

  Returns:
    A tf.data.Dataset object.
  """
  # Expand every comma-separated pattern; each one must match something.
  patterns = file_pattern.split(",")
  input_files = []
  for pattern in patterns:
    matched = tf.io.gfile.glob(pattern)
    if not matched:
      raise ValueError("Found no input files matching %s" % pattern)
    input_files += matched
  tf.compat.v1.logging.info("Building input pipeline from %d files matching patterns: %s",
                            len(input_files), patterns)

  if include_labels:
    label_map = input_config.label_map

    # Ensure that the label ids are contiguous integers starting at 0.
    label_ids = set(label_map.values())
    expected_ids = set(range(len(label_ids)))
    if label_ids != expected_ids:
      raise ValueError(
          "Label IDs must be contiguous integers starting at 0. Got: %s" %
          label_ids)

    # Create a HashTable mapping label strings to integer ids. Unknown
    # strings map to -1, which the parser below asserts against.
    label_to_id = lookup_ops.HashTable(
        lookup_ops.KeyValueTensorInitializer(
            keys=list(label_map.keys()),
            values=list(label_map.values()),
            key_dtype=tf.string,
            value_dtype=tf.int32),
        default_value=-1)

  def _example_parser(serialized_example):
    """Parses a single tf.Example into feature and label tensors."""
    # Set specifications for parsing the features.
    feature_specs = {}
    for spec_name, spec in input_config.features.items():
      feature_specs[spec_name] = tf.io.FixedLenFeature([spec.length],
                                                       tf.float32)
    if include_labels:
      feature_specs[input_config.label_feature] = tf.io.FixedLenFeature(
          [], tf.string)

    # Parse the features.
    parsed = tf.io.parse_single_example(
        serialized=serialized_example, features=feature_specs)

    reversing_enabled = reverse_time_series_prob > 0
    if reversing_enabled:
      # A single draw per example, so that all of its time series features
      # are reversed together or not at all.
      should_reverse = tf.less(
          tf.random.uniform([], 0, 1),
          reverse_time_series_prob,
          name="should_reverse")

    # Reorganize outputs.
    output = {}
    for feature_name, value in parsed.items():
      if include_labels and feature_name == input_config.label_feature:
        label_id = label_to_id.lookup(value)
        # Ensure that the label_id is nonnegative to verify a successful hash
        # map lookup.
        assert_known_label = tf.Assert(
            tf.greater_equal(label_id, tf.cast(0, dtype=tf.int32)),
            ["Unknown label string:", value])
        with tf.control_dependencies([assert_known_label]):
          label_id = tf.identity(label_id)

        # We use the plural name "labels" in the output due to batching.
        output["labels"] = label_id
        continue

      if input_config.features[feature_name].is_time_series:
        # Possibly reverse.
        if reversing_enabled:
          # pylint:disable=cell-var-from-loop
          value = tf.cond(
              pred=should_reverse,
              true_fn=lambda: tf.reverse(value, axis=[0]),
              false_fn=lambda: tf.identity(value))
          # pylint:enable=cell-var-from-loop
        output.setdefault("time_series_features", {})[feature_name] = value
      else:
        output.setdefault("aux_features", {})[feature_name] = value

    return output

  # Create a string dataset of filenames, and possibly shuffle.
  filename_dataset = tf.data.Dataset.from_tensor_slices(input_files)
  if shuffle_filenames and len(input_files) > 1:
    filename_dataset = filename_dataset.shuffle(len(input_files))

  # Read serialized Example protos.
  dataset = filename_dataset.flat_map(tf.data.TFRecordDataset)

  # Possibly shuffle. Note that we shuffle before repeat(), so we only shuffle
  # elements among each "epoch" of data, and not across epochs of data.
  if shuffle_values_buffer > 0:
    dataset = dataset.shuffle(shuffle_values_buffer)

  # Repeat.
  if repeat != 1:
    dataset = dataset.repeat(repeat)

  # Map the parser over the dataset.
  dataset = dataset.map(_example_parser, num_parallel_calls=4)

  # Batch results by up to batch_size.
  dataset = dataset.batch(batch_size)

  repeats_forever = repeat is None or repeat == -1
  if repeats_forever:
    # The dataset repeats infinitely before batching, so each batch has the
    # maximum number of elements.
    dataset = set_batch_size(dataset, batch_size)
  elif use_tpu:
    # TPU requires all dimensions to be fixed. Since the dataset does not
    # repeat infinitely before batching, the final batch may have fewer than
    # batch_size elements. Therefore we pad to ensure that the final batch has
    # batch_size elements.
    dataset = pad_dataset_to_batch_size(dataset, batch_size)

  # Prefetch a few batches.
  prefetch_batches = max(1, int(256 / batch_size))
  return dataset.prefetch(prefetch_batches)
def create_infer_model(model_creator, hparams, scope=None, extra_args=None):
  """Create inference model.

  Builds a fresh `tf.Graph` containing the vocabulary tables, a source
  placeholder feeding an inference iterator, and the model produced by
  `model_creator`.

  Args:
    model_creator: Callable invoked as `model_creator(hparams, iterator=...,
      mode=..., source_vocab_table=..., target_vocab_table=...,
      reverse_target_vocab_table=..., scope=..., extra_args=...)`.
    hparams: Hyperparameters. This function reads `src_vocab_file`,
      `tgt_vocab_file`, `share_vocab`, `eos` and `src_max_len_infer`.
    scope: (Optional) Scope passed to `model_creator`; also used as the
      container name, which defaults to "infer" when `scope` is falsy.
    extra_args: (Optional) Forwarded unchanged to `model_creator`.

  Returns:
    An `InferModel` holding the graph, the model, the source placeholder
    ("src_place"), the batch size tensor (a constant 1 named "batch_place"),
    the iterator, and `insert_op`: a tuple of the `init` ops of the source,
    target and reverse-target vocabulary tables.
  """
  graph = tf.Graph()
  src_vocab_file = hparams.src_vocab_file
  tgt_vocab_file = hparams.tgt_vocab_file

  # The reverse table maps target ids back to target words, so its contents
  # must come from the *target* vocabulary file: line i holds the word for
  # id i. The file is read inside a `with` block so the handle is closed.
  with codecs.open(tgt_vocab_file, "r") as vocab_file:
    tmp_words = [line.strip() for line in vocab_file.readlines()]
  tmp_ids = list(range(len(tmp_words)))

  with graph.as_default(), tf.container(scope or "infer"):
    # Constant vocab table
    src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
        src_vocab_file, tgt_vocab_file, hparams.share_vocab)

    # The id -> word table is built from in-graph constants rather than with
    # `lookup_ops.index_to_string_table_from_file`. Ids missing from the
    # table decode to "<unk>".
    vals = tf.constant(tmp_words, dtype=tf.string)
    keys = tf.constant(tmp_ids, dtype=tf.int64)
    reverse_tgt_vocab_table = lookup_ops.HashTable(
        lookup_ops.KeyValueTensorInitializer(keys, vals),
        "<unk>",
        name="reverse_table")

    # debug
    print("SRC:", src_vocab_table)
    print("SRC type:", type(src_vocab_table))

    src_placeholder = tf.placeholder(
        shape=[None], dtype=tf.string, name="src_place")
    # The batch size is a constant 1 here instead of a fed placeholder.
    batch_size_placeholder = tf.constant(1, dtype=tf.int64, name="batch_place")

    src_dataset = tf.data.Dataset.from_tensor_slices(src_placeholder)
    iterator = iterator_utils.get_infer_iterator(
        src_dataset,
        src_vocab_table,
        batch_size=batch_size_placeholder,
        eos=hparams.eos,
        src_max_len=hparams.src_max_len_infer)

    model = model_creator(
        hparams,
        iterator=iterator,
        mode=tf.contrib.learn.ModeKeys.INFER,
        source_vocab_table=src_vocab_table,
        target_vocab_table=tgt_vocab_table,
        reverse_target_vocab_table=reverse_tgt_vocab_table,
        scope=scope,
        extra_args=extra_args)

  return InferModel(
      graph=graph,
      model=model,
      src_placeholder=src_placeholder,
      batch_size_placeholder=batch_size_placeholder,
      iterator=iterator,
      insert_op=(src_vocab_table.init, tgt_vocab_table.init,
                 reverse_tgt_vocab_table.init))