Example #1
  def _build(self, features, labels):
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, _ = \
      vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, _ = \
      vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab
    }, "vocab_tables")

    # Slice source to max_len
    if self.max_seq_len_source is not None:
      features["source_tokens"] = features[
          "source_tokens"][:, :self.max_seq_len_source]
      features["source_len"] = tf.minimum(
          features["source_len"], self.max_seq_len_source)

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(features[
        "source_tokens"])

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.max_seq_len_target is not None:
      labels["target_tokens"] = labels[
          "target_tokens"][:, :self.max_seq_len_target]
      labels["target_len"] = tf.minimum(
          labels["target_len"], self.max_seq_len_target)

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    return features, labels
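
The "vocab_tables" collection written above can be read back anywhere else in the graph. A minimal retrieval sketch, assuming graph_utils exposes a get_dict_from_collection counterpart to the add_dict_to_collection call used here, and where predicted_ids is a hypothetical int64 tensor of decoder outputs:

# Fetch the tables registered under "vocab_tables" in _build above.
vocab_tables = graph_utils.get_dict_from_collection("vocab_tables")
# Map decoder output ids back to tokens for display.
predicted_tokens = vocab_tables["target_id_to_vocab"].lookup(predicted_ids)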
Example #2
  def test_with_counts(self):
    vocab_list = ["Hello", ".", "笑"]
    vocab_counts = [100, 200, 300]
    vocab_file = test_utils.create_temporary_vocab_file(vocab_list,
                                                        vocab_counts)

    vocab_to_id_table, id_to_vocab_table, word_to_count_table, vocab_size = \
      vocab.create_vocabulary_lookup_table(vocab_file.name)

    self.assertEqual(vocab_size, 6)

    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      sess.run(tf.local_variables_initializer())
      sess.run(tf.tables_initializer())

      ids = vocab_to_id_table.lookup(
          tf.convert_to_tensor(["Hello", ".", "笑", "??", "xxx"]))
      ids = sess.run(ids)
      np.testing.assert_array_equal(ids, [0, 1, 2, 3, 3])

      words = id_to_vocab_table.lookup(
          tf.convert_to_tensor(
              [0, 1, 2, 3], dtype=tf.int64))
      words = sess.run(words)
      np.testing.assert_array_equal(
          np.char.decode(words.astype("S"), "utf-8"),
          ["Hello", ".", "笑", "UNK"])

      counts = word_to_count_table.lookup(
          tf.convert_to_tensor(["Hello", ".", "笑", "??", "xxx"]))
      counts = sess.run(counts)
      np.testing.assert_array_equal(counts, [100, 200, 300, -1, -1])
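
For reference, the temporary vocab file built by test_utils.create_temporary_vocab_file is assumed here to be plain text with one token per line and an optional tab-separated count, along these lines:

Hello	100
.	200
笑	300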
Example #3
    def _preprocess(self, features, labels):
        """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for target vocab
    - Converts tokens into vocabulary ids
    - Prepends a special "SEQUENCE_START" token to the target
    - Appends a special "SEQUENCE_END" token to the target
    """

        # Create vocabulary lookup for target
        target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

        # Add vocab tables to graph collection so that we can access them in
        # other places.
        graph_utils.add_dict_to_collection(
            {
                "target_vocab_to_id": target_vocab_to_id,
                "target_id_to_vocab": target_id_to_vocab,
                "target_word_to_count": target_word_to_count
            }, "vocab_tables")

        if labels is None:
            return features, None

        labels = labels.copy()

        # Slices targets to max length
        if self.params["target.max_seq_len"] is not None:
            labels["target_tokens"] = labels[
                "target_tokens"][:, :self.params["target.max_seq_len"]]
            labels["target_len"] = tf.minimum(
                labels["target_len"], self.params["target.max_seq_len"])

        # Look up the target ids in the vocabulary
        labels["target_ids"] = target_vocab_to_id.lookup(
            labels["target_tokens"])

        labels["target_len"] = tf.to_int32(labels["target_len"])
        tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        return features, labels
Example #4
  def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for target vocab
    - Converts tokens into vocabulary ids
    - Prepends a special "SEQUENCE_START" token to the target
    - Appends a special "SEQUENCE_END" token to the target
    """

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
      vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
      labels["target_tokens"] = labels["target_tokens"][:, :self.params[
          "target.max_seq_len"]]
      labels["target_len"] = tf.minimum(labels["target_len"],
                                        self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
      graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
Example #5
    def test_lookup_table(self):

        vocab_to_id_table, id_to_vocab_table, vocab_size = \
          vocab.create_vocabulary_lookup_table(self.vocab_file.name)

        self.assertEqual(vocab_size, 6)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(tf.tables_initializer())

            ids = vocab_to_id_table.lookup(
                tf.convert_to_tensor(["Hello", ".", "笑", "??", "xxx"]))
            ids = sess.run(ids)
            np.testing.assert_array_equal(ids, [0, 1, 2, 3, 3])

            words = id_to_vocab_table.lookup(
                tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64))
            words = sess.run(words)
            np.testing.assert_array_equal(
                np.char.decode(words.astype("S"), "utf-8"),
                ["Hello", ".", "笑", "UNK"])
Example #6
    def _preprocess(self, features, labels):
        """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    """

        # Create vocabulary lookup for source
        source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

        # Create vocabulary lookup for target
        target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

        # Add vocab tables to graph collection so that we can access them in
        # other places.
        graph_utils.add_dict_to_collection(
            {
                "source_vocab_to_id": source_vocab_to_id,
                "source_id_to_vocab": source_id_to_vocab,
                "source_word_to_count": source_word_to_count,
                "target_vocab_to_id": target_vocab_to_id,
                "target_id_to_vocab": target_id_to_vocab,
                "target_word_to_count": target_word_to_count
            }, "vocab_tables")

        # Slice source to max_len
        if self.params["source.max_seq_len"] is not None:
            features["source_tokens"] = features[
                "source_tokens"][:, :self.params["source.max_seq_len"]]
            features["source_len"] = tf.minimum(
                features["source_len"], self.params["source.max_seq_len"])

        # Look up the source ids in the vocabulary
        features["source_ids"] = source_vocab_to_id.lookup(
            features["source_tokens"])

        # Maybe reverse the source
        if self.params["source.reverse"] is True:
            features["source_ids"] = tf.reverse_sequence(
                input=features["source_ids"],
                seq_lengths=features["source_len"],
                seq_dim=1,
                batch_dim=0,
                name=None)

        features["source_len"] = tf.to_int32(features["source_len"])
        tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

        if labels is None:
            return features, None

        labels = labels.copy()

        # Slices targets to max length
        if self.params["target.max_seq_len"] is not None:
            labels["target_tokens"] = labels[
                "target_tokens"][:, :self.params["target.max_seq_len"]]
            labels["target_len"] = tf.minimum(
                labels["target_len"], self.params["target.max_seq_len"])

        # Look up the target ids in the vocabulary
        labels["target_ids"] = target_vocab_to_id.lookup(
            labels["target_tokens"])

        labels["target_len"] = tf.to_int32(labels["target_len"])
        tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

        # Keep track of the number of processed tokens
        num_tokens = tf.reduce_sum(labels["target_len"])
        num_tokens += tf.reduce_sum(features["source_len"])
        token_counter_var = tf.Variable(0, name="tokens_counter")
        total_tokens = tf.assign_add(token_counter_var, num_tokens)
        tf.summary.scalar("num_tokens", total_tokens)

        with tf.control_dependencies([total_tokens]):
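            # Threading the counter update through a tensor the model always
            # consumes guarantees total_tokens is incremented on every step,
            # even though nothing else depends on it.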
            features["source_tokens"] = tf.identity(features["source_tokens"])

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        return features, labels
Example #7
    def _preprocess(self, features, labels):
        """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    - Appends a special "SEQUENCE_END" token to the source
    - Prepends a special "SEQUENCE_START" token to the target
    - Appends a special "SEQUENCE_END" token to the target
    """

        # Create vocabulary lookup for source
        source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

        # Create vocabulary lookup for target
        target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
          vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

        # Add vocab tables to graph collection so that we can access them in
        # other places.
        graph_utils.add_dict_to_collection(
            {
                "source_vocab_to_id": source_vocab_to_id,
                "source_id_to_vocab": source_id_to_vocab,
                "source_word_to_count": source_word_to_count,
                "target_vocab_to_id": target_vocab_to_id,
                "target_id_to_vocab": target_id_to_vocab,
                "target_word_to_count": target_word_to_count
            }, "vocab_tables")

        # Slice source to max_len
        if self.params["source.max_seq_len"] is not None:
            features["source_tokens"] = features[
                "source_tokens"][:, :self.params["source.max_seq_len"]]
            features["source_len"] = tf.minimum(
                features["source_len"], self.params["source.max_seq_len"])

        # Look up the source ids in the vocabulary
        features["source_ids"] = source_vocab_to_id.lookup(
            features["source_tokens"])

        # Maybe reverse the source
        if self.params["source.reverse"] is True:
            features["source_ids"] = tf.reverse_sequence(
                input=features["source_ids"],
                seq_lengths=features["source_len"],
                seq_dim=1,
                batch_dim=0,
                name=None)

        features["source_len"] = tf.to_int32(features["source_len"])
        tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

        if labels is None:
            return features, None

        labels = labels.copy()

        # Slices targets to max length
        if self.params["target.max_seq_len"] is not None:
            labels["target_tokens"] = labels[
                "target_tokens"][:, :self.params["target.max_seq_len"]]
            labels["target_len"] = tf.minimum(
                labels["target_len"], self.params["target.max_seq_len"])

        # Look up the target ids in the vocabulary
        labels["target_ids"] = target_vocab_to_id.lookup(
            labels["target_tokens"])

        labels["target_len"] = tf.to_int32(labels["target_len"])
        tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

        # Add to graph collection for later use
        graph_utils.add_dict_to_collection(features, "features")
        if labels:
            graph_utils.add_dict_to_collection(labels, "labels")

        return features, labels
Example #8
  def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels:

    - Creates vocabulary lookup tables for source and target vocab
    - Converts tokens into vocabulary ids
    """

    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
      vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
      vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "source_word_to_count": source_word_to_count,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count
    }, "vocab_tables")

    # Slice source to max_len
    if self.params["source.max_seq_len"] is not None:
      features["source_tokens"] = features["source_tokens"][:, :self.params[
          "source.max_seq_len"]]
      features["source_len"] = tf.minimum(features["source_len"],
                                          self.params["source.max_seq_len"])

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(features[
        "source_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
      features["source_ids"] = tf.reverse_sequence(
          input=features["source_ids"],
          seq_lengths=features["source_len"],
          seq_dim=1,
          batch_dim=0,
          name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
      return features, None

    labels = labels.copy()

    # Slices targets to max length
    if self.params["target.max_seq_len"] is not None:
      labels["target_tokens"] = labels["target_tokens"][:, :self.params[
          "target.max_seq_len"]]
      labels["target_len"] = tf.minimum(labels["target_len"],
                                        self.params["target.max_seq_len"])

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    token_counter_var = tf.Variable(0, name="tokens_counter")
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    with tf.control_dependencies([total_tokens]):
      features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
      graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
Example #9
# Assumes google/seq2seq's module layout for the vocab helper.
from seq2seq.data.vocab import create_vocabulary_lookup_table


def t(file_path, default_value=None):
    # Forward default_value so the caller's out-of-vocabulary setting is
    # actually used; the original version silently dropped it.
    tables = create_vocabulary_lookup_table(file_path, default_value)
    print(tables)
    return tables
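
A quick usage sketch of the helper above; the path is hypothetical, and any plain-text file with one token per line (optionally followed by a tab and a count) should work:

# Hypothetical vocab file path.
tables = t("/tmp/vocab.txt")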