Example #1
    def testCaptureHashTable(self):
        # NOTE(mrry): We must use the V2 variants of `HashTable`
        # etc. because these produce a `tf.resource`-typed output that is
        # compatible with the in-graph function implementation.
        default_val = -1
        keys = constant_op.constant(["brain", "salad", "surgery"])
        values = constant_op.constant([0, 1, 2], dtypes.int64)
        table = lookup_ops.HashTable(
            lookup_ops.KeyValueTensorInitializer(keys, values), default_val)

        input_sentences = dataset_ops.Dataset.from_tensor_slices(
            ["brain brain tank salad surgery", "surgery brain"])

        dataset = input_sentences.map(
            lambda x: string_ops.string_split([x]).values).map(table.lookup)

        get_next = self.getNext(dataset, requires_initialization=True)

        self.evaluate(table.initializer)
        self.evaluate(get_next())
        self.evaluate(get_next())
        with self.assertRaises(errors.OutOfRangeError):
            self.evaluate(get_next())
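The same capture pattern can be written with the public TF2 lookup API. Below is a minimal sketch (not part of the original test), assuming TF 2.x eager execution:

import tensorflow as tf

# Build a resource-backed lookup table with the public API; like the V2
# HashTable above, it can be captured inside a tf.data map function.
keys = tf.constant(["brain", "salad", "surgery"])
values = tf.constant([0, 1, 2], dtype=tf.int64)
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-1)

sentences = tf.data.Dataset.from_tensor_slices(
    ["brain brain tank salad surgery", "surgery brain"])

# tf.strings.split on a scalar string yields a rank-1 token tensor, which is
# then looked up through the captured table.
dataset = sentences.map(lambda x: table.lookup(tf.strings.split(x)))

for ids in dataset:
    print(ids.numpy())  # [0 0 -1 1 2], then [2 0]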
Example #2
def collecting_function(x):
    _ = lookup_ops.HashTable(
        lookup_ops.KeyValueTensorInitializer([], []), 0.0, name="t1")
    return x
Example #3
def skip_gram_sample_with_text_vocab(input_tensor,
                                     vocab_freq_file,
                                     vocab_token_index=0,
                                     vocab_token_dtype=tf.dtypes.string,
                                     vocab_freq_index=1,
                                     vocab_freq_dtype=tf.dtypes.float64,
                                     vocab_delimiter=",",
                                     vocab_min_count=0,
                                     vocab_subsampling=None,
                                     corpus_size=None,
                                     min_skips=1,
                                     max_skips=5,
                                     start=0,
                                     limit=-1,
                                     emit_self_as_target=False,
                                     batch_size=None,
                                     batch_capacity=None,
                                     seed=None,
                                     name=None):
    """Skip-gram sampling with a text vocabulary file.

    Wrapper around `skip_gram_sample()` for use with a text vocabulary file. The
    vocabulary file is expected to be a plain-text file, with lines of
    `vocab_delimiter`-separated columns. The `vocab_token_index` column should
    contain the vocabulary term, while the `vocab_freq_index` column should
    contain the number of times that term occurs in the corpus. For example, with
    a text vocabulary file of:

      ```
      bonjour,fr,42
      hello,en,777
      hola,es,99
      ```

    You should set `vocab_delimiter=","`, `vocab_token_index=0`, and
    `vocab_freq_index=2`.

    See `skip_gram_sample()` documentation for more details about the skip-gram
    sampling process.

    Args:
      input_tensor: A rank-1 `Tensor` from which to generate skip-gram candidates.
      vocab_freq_file: `string` specifying full file path to the text vocab file.
      vocab_token_index: `int` specifying which column in the text vocab file
        contains the tokens.
      vocab_token_dtype: `DType` specifying the format of the tokens in the text
        vocab file.
      vocab_freq_index: `int` specifying which column in the text vocab file
        contains the frequency counts of the tokens.
      vocab_freq_dtype: `DType` specifying the format of the frequency counts in
        the text vocab file.
      vocab_delimiter: `string` specifying the delimiter used in the text vocab
        file.
      vocab_min_count: `int`, `float`, or scalar `Tensor` specifying
        minimum frequency threshold (from `vocab_freq_file`) for a token to be
        kept in `input_tensor`. This should correspond with `vocab_freq_dtype`.
      vocab_subsampling: (Optional) `float` specifying frequency proportion
        threshold for tokens from `input_tensor`. Tokens that occur more
        frequently will be randomly down-sampled. Reasonable starting values may
        be around 1e-3 or 1e-5. See Eq. 5 in http://arxiv.org/abs/1310.4546 for
        more details.
      corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying the
        total number of tokens in the corpus (e.g., sum of all the frequency
        counts of `vocab_freq_file`). Used with `vocab_subsampling` for
        down-sampling frequently occurring tokens. If this is specified,
        `vocab_freq_file` and `vocab_subsampling` must also be specified.
        If `corpus_size` is needed but not supplied, then it will be calculated
        from `vocab_freq_file`. You might want to supply your own value if you
        have already eliminated infrequent tokens from your vocabulary files
        (where frequency < vocab_min_count) to save memory in the internal token
        lookup table. Otherwise, the unused tokens' variables will waste memory.
        The user-supplied `corpus_size` value must be greater than or equal to the
        sum of all the frequency counts of `vocab_freq_file`.
      min_skips: `int` or scalar `Tensor` specifying the minimum window size to
        randomly use for each token. Must be >= 0 and <= `max_skips`. If
        `min_skips` and `max_skips` are both 0, the only label emitted will be
        the token itself when `emit_self_as_target=True`, and there will be no
        output otherwise.
      max_skips: `int` or scalar `Tensor` specifying the maximum window size to
        randomly use for each token. Must be >= 0.
      start: `int` or scalar `Tensor` specifying the position in `input_tensor`
        from which to start generating skip-gram candidates.
      limit: `int` or scalar `Tensor` specifying the maximum number of elements in
        `input_tensor` to use in generating skip-gram candidates. -1 means to use
        the rest of the `Tensor` after `start`.
      emit_self_as_target: `bool` or scalar `Tensor` specifying whether to emit
        each token as a label for itself.
      batch_size: (Optional) `int` specifying batch size of returned `Tensors`.
      batch_capacity: (Optional) `int` specifying batch capacity for the queue
        used for batching returned `Tensors`. Only has an effect if
        `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
      seed: (Optional) `int` used to create a random seed for window size and
        subsampling. See
        [`set_random_seed`](../../g3doc/python/constant_op.md#set_random_seed)
        for behavior.
      name: (Optional) A `string` name or a name scope for the operations.

    Returns:
      A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
      rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
      length `batch_size`; if `batch_size` is not specified, they will be of
      random length, though they will be in sync with each other as long as they
      are evaluated together.

    Raises:
      ValueError: If `vocab_token_index` or `vocab_freq_index` is less than 0 or
        exceeds the number of columns in `vocab_freq_file`. If `vocab_token_index`
        and `vocab_freq_index` are both set to the same column. If any token in
        `vocab_freq_file` has a negative frequency.
    """

    if vocab_token_index < 0 or vocab_freq_index < 0:
        raise ValueError(
            "vocab_token_index={} and vocab_freq_index={} must both be >= 0.".
            format(vocab_token_index, vocab_freq_index))
    if vocab_token_index == vocab_freq_index:
        raise ValueError(
            "vocab_token_index and vocab_freq_index should be different, but are "
            "both {}.".format(vocab_token_index))

    # Iterates through the vocab file and calculates the number of vocab terms as
    # well as the total corpus size (by summing the frequency counts of all the
    # vocab terms).
    calculated_corpus_size = 0.0
    vocab_size = 0
    with tf.io.gfile.GFile(vocab_freq_file, mode="r") as f:
        reader = csv.reader(f, delimiter=vocab_delimiter)
        for row in reader:
            if vocab_token_index >= len(row) or vocab_freq_index >= len(row):
                raise ValueError(
                    "Row in vocab file only has {} columns, so vocab_token_index={} or "
                    "vocab_freq_index={} is out of bounds. Row content: {}".
                    format(len(row), vocab_token_index, vocab_freq_index, row))
            vocab_size += 1
            freq = vocab_freq_dtype.as_numpy_dtype(row[vocab_freq_index])
            if freq < 0:
                raise ValueError(
                    "Row in vocab file has negative frequency of {}. Row content: {}"
                    .format(freq, row))
            # Note: tokens whose frequencies are below vocab_min_count will still
            # contribute to the total corpus size used for vocab subsampling.
            calculated_corpus_size += freq

    if not corpus_size:
        corpus_size = calculated_corpus_size
    elif calculated_corpus_size - corpus_size > 1e-6:
        raise ValueError(
            "`corpus_size`={} must be greater than or equal to the sum of all the "
            "frequency counts ({}) of `vocab_freq_file` ({}).".format(
                corpus_size, calculated_corpus_size, vocab_freq_file))

    vocab_freq_table = lookup_ops.HashTable(
        lookup_ops.TextFileInitializer(filename=vocab_freq_file,
                                       key_dtype=vocab_token_dtype,
                                       key_index=vocab_token_index,
                                       value_dtype=vocab_freq_dtype,
                                       value_index=vocab_freq_index,
                                       vocab_size=vocab_size,
                                       delimiter=vocab_delimiter),
        # For vocab terms not in vocab file, use a default value of -1.
        default_value=-1)

    return skip_gram_sample(
        input_tensor,
        min_skips=min_skips,
        max_skips=max_skips,
        start=start,
        limit=limit,
        emit_self_as_target=emit_self_as_target,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=vocab_min_count,
        vocab_subsampling=vocab_subsampling,
        # corpus_size is not used unless vocab_subsampling is specified.
        corpus_size=None if vocab_subsampling is None else corpus_size,
        batch_size=batch_size,
        batch_capacity=batch_capacity,
        seed=seed,
        name=name)
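A rough usage sketch for the function above, assuming a hypothetical three-column vocabulary file in the format shown in the docstring, and TF1-style graph execution as in the original contrib code:

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # graph mode, as in the original code

# Hypothetical vocab file at /tmp/vocab.csv containing:
#   bonjour,fr,42
#   hello,en,777
#   hola,es,99
input_tokens = tf.constant(["hello", "hola", "bonjour", "hello", "hello"])

tokens, labels = skip_gram_sample_with_text_vocab(
    input_tensor=input_tokens,
    vocab_freq_file="/tmp/vocab.csv",
    vocab_token_index=0,    # column 0 holds the token
    vocab_freq_index=2,     # column 2 holds its frequency count
    vocab_delimiter=",",
    vocab_min_count=1.0,    # matches the float64 default of vocab_freq_dtype
    min_skips=1,
    max_skips=2)

# The lookup table must be initialized before the outputs are evaluated.
with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.tables_initializer())
    print(sess.run([tokens, labels]))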
Example #4
def build_dataset(file_pattern,
                  input_config,
                  batch_size,
                  include_labels=True,
                  reverse_time_series_prob=0,
                  shuffle_filenames=False,
                  shuffle_values_buffer=0,
                  repeat=1,
                  use_tpu=False):
    """Builds an input pipeline that reads a dataset from sharded TFRecord files.

    Args:
      file_pattern: File pattern matching input TFRecord files, e.g.
          "/tmp/train-?????-of-00100". May also be a comma-separated list of file
          patterns.
      input_config: ConfigDict containing feature and label specifications.
      batch_size: The number of examples per batch.
      include_labels: Whether to read labels from the input files.
      reverse_time_series_prob: If > 0, the time series features will be randomly
          reversed with this probability. Within a given example, either all time
          series features will be reversed, or none will be reversed.
      shuffle_filenames: Whether to shuffle the order of TFRecord files between
          epochs.
      shuffle_values_buffer: If > 0, shuffle examples using a buffer of this size.
      repeat: The number of times to repeat the dataset. If None or -1 the dataset
          will repeat indefinitely.
      use_tpu: Whether to build the dataset for TPU.

    Raises:
      ValueError: If an input file pattern does not match any files, or if the
          label IDs in input_config.label_map are not contiguous integers starting
          at 0.

    Returns:
      A tf.data.Dataset object.
    """
    file_patterns = file_pattern.split(",")
    filenames = []
    for p in file_patterns:
        matches = tf.io.gfile.glob(p)
        if not matches:
            raise ValueError("Found no input files matching %s" % p)
        filenames.extend(matches)
    tf.compat.v1.logging.info("Building input pipeline from %d files matching patterns: %s",
                              len(filenames), file_patterns)

    if include_labels:
        # Ensure that the label ids are contiguous integers starting at 0.
        label_ids = set(input_config.label_map.values())
        if label_ids != set(range(len(label_ids))):
            raise ValueError(
                "Label IDs must be contiguous integers starting at 0. Got: %s" %
                label_ids)

        # Create a HashTable mapping label strings to integer ids.
        table_initializer = lookup_ops.KeyValueTensorInitializer(
            keys=list(input_config.label_map.keys()),
            values=list(input_config.label_map.values()),
            key_dtype=tf.string,
            value_dtype=tf.int32)
        label_to_id = lookup_ops.HashTable(
            table_initializer, default_value=-1)

    def _example_parser(serialized_example):
        """Parses a single tf.Example into feature and label tensors."""
        # Set specifications for parsing the features.
        data_fields = {
            feature_name: tf.io.FixedLenFeature([feature.length], tf.float32)
            for feature_name, feature in input_config.features.items()
        }
        if include_labels:
            data_fields[input_config.label_feature] = tf.io.FixedLenFeature([],
                                                                            tf.string)

        # Parse the features.
        parsed_features = tf.io.parse_single_example(
            serialized=serialized_example, features=data_fields)

        if reverse_time_series_prob > 0:
            # Randomly reverse time series features with probability
            # reverse_time_series_prob.
            should_reverse = tf.less(
                tf.random.uniform([], 0, 1),
                reverse_time_series_prob,
                name="should_reverse")

        # Reorganize outputs.
        output = {}
        for feature_name, value in parsed_features.items():
            if include_labels and feature_name == input_config.label_feature:
                label_id = label_to_id.lookup(value)
                # Ensure that the label_id is nonnegative to verify a successful hash
                # map lookup.
                assert_known_label = tf.Assert(
                    tf.greater_equal(label_id, tf.cast(0, dtype=tf.int32)),
                    ["Unknown label string:", value])
                with tf.control_dependencies([assert_known_label]):
                    label_id = tf.identity(label_id)

                # We use the plural name "labels" in the output due to batching.
                output["labels"] = label_id
            elif input_config.features[feature_name].is_time_series:
                # Possibly reverse.
                if reverse_time_series_prob > 0:
                    # pylint:disable=cell-var-from-loop
                    value = tf.cond(pred=should_reverse, true_fn=lambda: tf.reverse(value, axis=[0]),
                                    false_fn=lambda: tf.identity(value))
                    # pylint:enable=cell-var-from-loop
                if "time_series_features" not in output:
                    output["time_series_features"] = {}
                output["time_series_features"][feature_name] = value
            else:
                if "aux_features" not in output:
                    output["aux_features"] = {}
                output["aux_features"][feature_name] = value

        return output

    # Create a string dataset of filenames, and possibly shuffle.
    filename_dataset = tf.data.Dataset.from_tensor_slices(filenames)
    if len(filenames) > 1 and shuffle_filenames:
        filename_dataset = filename_dataset.shuffle(len(filenames))

    # Read serialized Example protos.
    dataset = filename_dataset.flat_map(tf.data.TFRecordDataset)

    # Possibly shuffle. Note that we shuffle before repeat(), so we only shuffle
    # elements among each "epoch" of data, and not across epochs of data.
    if shuffle_values_buffer > 0:
        dataset = dataset.shuffle(shuffle_values_buffer)

    # Repeat.
    if repeat != 1:
        dataset = dataset.repeat(repeat)

    # Map the parser over the dataset.
    dataset = dataset.map(_example_parser, num_parallel_calls=4)

    # Batch results by up to batch_size.
    dataset = dataset.batch(batch_size)
    if repeat == -1 or repeat is None:
        # The dataset repeats infinitely before batching, so each batch has the
        # maximum number of elements.
        dataset = set_batch_size(dataset, batch_size)
    elif use_tpu:
        # TPU requires all dimensions to be fixed. Since the dataset does not repeat
        # infinitely before batching, the final batch may have fewer than batch_size
        # elements. Therefore we pad to ensure that the final batch has batch_size
        # elements.
        dataset = pad_dataset_to_batch_size(dataset, batch_size)

    # Prefetch a few batches.
    dataset = dataset.prefetch(max(1, int(256 / batch_size)))

    return dataset
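A hedged usage sketch for build_dataset; the feature names, label map, and file pattern below are hypothetical stand-ins for the project's real ConfigDict and TFRecord data:

import tensorflow as tf
from types import SimpleNamespace

# Hypothetical stand-in for the ConfigDict that build_dataset expects: each
# feature needs `length` and `is_time_series` attributes, and label ids must
# be contiguous integers starting at 0.
input_config = SimpleNamespace(
    features={
        "global_view": SimpleNamespace(length=201, is_time_series=True),
        "aux_scalar": SimpleNamespace(length=1, is_time_series=False),
    },
    label_feature="label_str",
    label_map={"planet": 1, "false_positive": 0},
)

dataset = build_dataset(
    file_pattern="/tmp/tfrecords/train-*",  # assumed to match existing shards
    input_config=input_config,
    batch_size=64,
    shuffle_values_buffer=1000,
    repeat=None)  # repeat indefinitely, e.g. for training

# Each element is a dict with "labels", "time_series_features", and
# "aux_features" entries, batched up to batch_size.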
Example #5
def create_infer_model(model_creator, hparams, scope=None, extra_args=None):
    """Create inference model."""
    graph = tf.Graph()
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file

    # REvo added
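    # Note: the id/token lists for the reverse table below are read from
    # src_vocab_file, which implicitly assumes a shared source/target vocab.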
    tgt_table = codecs.open(src_vocab_file, 'r').readlines()
    tmp_ids = []
    tmp_words = []
    for i in range(len(tgt_table)):
        tmp_ids.append(i)
        tmp_words.append(tgt_table[i].strip())

    with graph.as_default(), tf.container(scope or "infer"):
        # Constant vocab table
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
            src_vocab_file, tgt_vocab_file, hparams.share_vocab)

        # reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
        #     tgt_vocab_file, default_value=vocab_utils.UNK, name="reverse_table")
        # added
        vals = tf.constant(tmp_words, dtype=tf.string)
        keys = tf.constant(tmp_ids, dtype=tf.int64)
        reverse_tgt_vocab_table = lookup_ops.HashTable(
            lookup_ops.KeyValueTensorInitializer(keys, vals),
            "<unk>",
            name="reverse_table")
        #

        # debug
        print("SRC:", src_vocab_table)
        print("SRC type:", type(src_vocab_table))
        #
        src_placeholder = tf.placeholder(shape=[None],
                                         dtype=tf.string,
                                         name="src_place")
        # batch_size_placeholder = tf.placeholder(shape=[], dtype=tf.int64, name="batch_place")
        batch_size_placeholder = tf.constant(1,
                                             dtype=tf.int64,
                                             name="batch_place")

        src_dataset = tf.data.Dataset.from_tensor_slices(src_placeholder)
        iterator = iterator_utils.get_infer_iterator(
            src_dataset,
            src_vocab_table,
            batch_size=batch_size_placeholder,
            eos=hparams.eos,
            src_max_len=hparams.src_max_len_infer)
        model = model_creator(
            hparams,
            iterator=iterator,
            mode=tf.contrib.learn.ModeKeys.INFER,
            source_vocab_table=src_vocab_table,
            target_vocab_table=tgt_vocab_table,
            reverse_target_vocab_table=reverse_tgt_vocab_table,
            scope=scope,
            extra_args=extra_args)

        # Debug
        # with tf.Session() as sess:
        #     # init
        #     sess.run(
        #         iterator.initializer,
        #         feed_dict={
        #             src_placeholder: iterator.infer_data,
        #             batch_size_placeholder: 64
        #         })
        #     value = sess.run(iterator.source)
        #     print ("value:", value)
        # sys.exit()

    return InferModel(graph=graph,
                      model=model,
                      src_placeholder=src_placeholder,
                      batch_size_placeholder=batch_size_placeholder,
                      iterator=iterator,
                      insert_op=(src_vocab_table.init, tgt_vocab_table.init,
                                 reverse_tgt_vocab_table.init))
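A rough sketch of how the returned InferModel might be driven, following the commented-out debug block above. It assumes the InferModel namedtuple exposes the fields passed in the return statement, that `model_creator` and `hparams` are in scope, and that `infer_data` is a hypothetical list of raw source sentences:

import tensorflow as tf

infer_data = ["how are you ?", "good morning"]  # hypothetical input sentences

infer_model = create_infer_model(model_creator, hparams)
with tf.Session(graph=infer_model.graph) as sess:
    # Initialize the vocab lookup tables, then the iterator with the
    # sentences to translate (batch size is the constant defined above).
    sess.run(infer_model.insert_op)
    sess.run(infer_model.iterator.initializer,
             feed_dict={infer_model.src_placeholder: infer_data})
    # The nmt decoding loop would then run the model's inference ops until
    # the iterator raises tf.errors.OutOfRangeError.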