Example #1
        def preproc_post(raw_post):
            # Split the input string, assuming whitespace is the delimiter.
            # The client should perform any required tokenization for us and
            # join on ' '.

            # WARNING: This can break if the user leaves the values at their
            # defaults (-1). For conll, mxlen=124; for idr, mxlen is forced to
            # the max BPTT; for twpos, mxlen=38. This should probably be fixed
            # by serializing the model's mxlen, or by re-reading it from the
            # saved tensor.
            mxlen = task.config_params['preproc']['mxlen']
            mxwlen = task.config_params['preproc']['mxwlen']

            # Debug: raw_post = tf.Print(raw_post, [raw_post])
            raw_tokens = tf.string_split(tf.reshape(raw_post, [-1])).values
            # Truncate so the sentence length is <= mxlen
            nraw_post = tf.reduce_join(raw_tokens[:mxlen], separator=" ")
            # The vocab contains only lowercase words, so lowercase the input
            # character by character
            split_chars = tf.string_split(tf.reshape(nraw_post, [-1]),
                                          delimiter="").values
            upchar_inds = upchars_lut.lookup(split_chars)
            lc_raw_post = tf.reduce_join(
                tf.map_fn(
                    lambda x: tf.cond(x[0] > 25, lambda: x[1], lambda: lchars[x[0]]),
                    (upchar_inds, split_chars),
                    dtype=tf.string))
            word_tokens = tf.string_split(tf.reshape(lc_raw_post, [-1]))

            # The number of chars per word should be <= mxwlen
            unchanged_word_tokens = tf.string_split(tf.reshape(
                nraw_post, [-1]))
            culled_word_token_vals = tf.substr(unchanged_word_tokens.values, 0,
                                               mxwlen)
            char_tokens = tf.string_split(culled_word_token_vals, delimiter='')
            word_indices = word2index.lookup(word_tokens)
            char_indices = char2index.lookup(char_tokens)

            # Reshape them out to the proper length
            reshaped_words = tf.sparse_reshape(word_indices, shape=[-1])
            sentence_length = tf.size(reshaped_words)  # use tf.shape if 2 dims are needed

            reshaped_words = tf.sparse_reset_shape(reshaped_words,
                                                   new_shape=[mxlen])
            reshaped_chars = tf.sparse_reset_shape(char_indices,
                                                   new_shape=[mxlen, mxwlen])

            # Now convert to a dense representation
            x = tf.sparse_tensor_to_dense(reshaped_words)
            x = tf.contrib.framework.with_shape([mxlen], x)
            xch = tf.sparse_tensor_to_dense(reshaped_chars)
            xch = tf.contrib.framework.with_shape([mxlen, mxwlen], xch)
            return x, xch, sentence_length
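
The essential pattern above, padding a variable-length token sequence out to a fixed [mxlen] vector via sparse ops, can be reproduced in a standalone sketch (TF 1.x). The two-word vocabulary, input string, and mxlen value here are illustrative assumptions, not part of the original model:

    import tensorflow as tf

    mxlen = 5
    post = tf.constant("hello world")
    # Split into tokens and map them to ids through a lookup table
    tokens = tf.string_split(tf.reshape(post, [-1]))
    table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(["hello", "world"]), default_value=0)
    indices = table.lookup(tokens)
    # Flatten, pad the sparse row out to mxlen, then densify
    reshaped = tf.sparse_reshape(indices, shape=[-1])
    padded = tf.sparse_reset_shape(reshaped, new_shape=[mxlen])
    dense = tf.sparse_tensor_to_dense(padded)

    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(dense))  # [0 1 0 0 0]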
Example #2
def _split_string_to_fix_words(line, delimiter, max_words):
    # Split each line into tokens, truncate each row to at most max_words,
    # then pin the dense shape to exactly [num_lines, max_words]
    words = tf.string_split(line, delimiter)

    fix_shape = [words.dense_shape[0], max_words]
    fix_words = tf.sparse_reset_shape(
        tf.sparse_slice(words, [0, 0], fix_shape), fix_shape)
    return fix_words
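
A hypothetical call (TF 1.x) shows the truncate-and-pad behavior; the input lines, delimiter, and max_words are made up for illustration:

    lines = tf.constant(["a b c d", "e f"])
    fixed = _split_string_to_fix_words(lines, delimiter=" ", max_words=3)
    dense = tf.sparse_tensor_to_dense(fixed, default_value="")

    with tf.Session() as sess:
        print(sess.run(dense))  # [[b'a' b'b' b'c'] [b'e' b'f' b'']]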
Example #3
    def _reshape_indices(self, indices, shape):
        reshaped = tf.sparse_reset_shape(indices, new_shape=shape)

        # Now convert to a dense representation
        x = tf.sparse_tensor_to_dense(reshaped)
        x = tf.contrib.framework.with_shape(shape, x)

        return x
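
The with_shape call is what pins the static shape, so downstream ops see it at graph-construction time. A small standalone check (TF 1.x), with invented ids and target shape:

    ids = tf.SparseTensor(indices=[[0], [1]], values=[7, 9], dense_shape=[2])
    x = tf.sparse_tensor_to_dense(tf.sparse_reset_shape(ids, new_shape=[4]))
    x = tf.contrib.framework.with_shape([4], x)
    print(x.get_shape())  # (4,), static, even before the graph runs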
Example #4
def decode(serialized_example):
    features = tf.parse_single_example(
        serialized_example,
        features={
            'image/data': tf.FixedLenFeature([], tf.string, default_value=''),
            'image/height': tf.FixedLenFeature([], tf.int64, default_value=0),
            'image/width': tf.FixedLenFeature([], tf.int64, default_value=0),
            'image/channel': tf.FixedLenFeature([], tf.int64, default_value=3),
            'image/name': tf.FixedLenFeature([], tf.string, default_value=''),
            'image/format': tf.FixedLenFeature([], tf.string,
                                               default_value=''),
            'label/top_left_height': tf.VarLenFeature(tf.int64),
            'label/top_left_width': tf.VarLenFeature(tf.int64),
            'label/top_right_height': tf.VarLenFeature(tf.int64),
            'label/top_right_width': tf.VarLenFeature(tf.int64),
            'label/bottom_left_height': tf.VarLenFeature(tf.int64),
            'label/bottom_left_width': tf.VarLenFeature(tf.int64),
            'label/bottom_right_height': tf.VarLenFeature(tf.int64),
            'label/bottom_right_width': tf.VarLenFeature(tf.int64),
        })

    # NOTE: `channel`, `height`, and `width` are assumed to be bound in the
    # enclosing scope; decode_jpeg needs a static channel count, and the
    # corner labels below are normalized by the image dimensions.
    image = tf.image.decode_jpeg(features['image/data'], channels=channel)

    top_left_height = features['label/top_left_height'] / height
    top_left_width = features['label/top_left_width'] / width
    top_right_height = features['label/top_right_height'] / height
    top_right_width = features['label/top_right_width'] / width
    bottom_left_height = features['label/bottom_left_height'] / height
    bottom_left_width = features['label/bottom_left_width'] / width
    bottom_right_height = features['label/bottom_right_height'] / height
    bottom_right_width = features['label/bottom_right_width'] / width

    coordinates = [
        top_left_height, top_left_width, top_right_height, top_right_width,
        bottom_left_height, bottom_left_width, bottom_right_height,
        bottom_right_width
    ]

    coordinates = [
        tf.sparse_reset_shape(x, new_shape=(2, )) for x in coordinates
    ]
    coordinates = [
        tf.sparse_tensor_to_dense(x, default_value=0) for x in coordinates
    ]

    coordinates = tf.stack(coordinates, axis=1)
    coordinates = tf.cast(coordinates, tf.float32)
    coordinates = tf.reshape(coordinates, [-1])

    return image, coordinates
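
A minimal sketch of wiring decode into a tf.data input pipeline (TF 1.x); the TFRecord filename is a placeholder, and the free variables noted above must be bound first:

    dataset = tf.data.TFRecordDataset("corners.tfrecord")  # hypothetical file
    image, coordinates = dataset.map(decode).make_one_shot_iterator().get_next()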
Example #5
    def module_fn_with_preprocessing():
        """Spec function for a full-text embedding module with preprocessing."""
        sentences = tf.placeholder(shape=[None],
                                   dtype=tf.string,
                                   name="sentences")
        # Perform a minimalistic text preprocessing by removing punctuation and
        # splitting on spaces.
        normalized_sentences = tf.regex_replace(input=sentences,
                                                pattern=r"\pP",
                                                rewrite="")
        tokens = tf.string_split(normalized_sentences, " ")

        embeddings_var = tf.get_variable(initializer=tf.zeros(
            [vocab_size + num_oov_buckets, embeddings_dim]),
                                         name=EMBEDDINGS_VAR_NAME,
                                         dtype=tf.float32)
        table_initializer = tf.lookup.TextFileInitializer(
            vocabulary_file, tf.string, tf.lookup.TextFileIndex.WHOLE_LINE,
            tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER)
        lookup_table = tf.lookup.StaticVocabularyTable(
            table_initializer, num_oov_buckets=num_oov_buckets)
        sparse_ids = tf.SparseTensor(indices=tokens.indices,
                                     values=lookup_table.lookup(tokens.values),
                                     dense_shape=tokens.dense_shape)

        # In case some of the input sentences are empty before or after
        # normalization, we will end up with empty rows. We do, however, want
        # to return an embedding for every row, so we fill in the empty rows
        # with a default.
        sparse_ids, _ = tf.sparse_fill_empty_rows(
            sparse_ids, lookup_table.lookup(tf.constant("")))
        # In case all of the input sentences are empty before or after
        # normalization, we will end up with a SparseTensor with shape [?, 0]. After
        # filling in the empty rows we must ensure the shape is set properly to
        # [?, 1]. At this point, there are no empty rows, so the new shape will be
        # [sparse_ids.dense_shape[0], max(1, sparse_ids.dense_shape[1])].
        sparse_ids = tf.sparse_reset_shape(sparse_ids)

        combined_embedding = tf.nn.embedding_lookup_sparse(
            params=embeddings_var,
            sp_ids=sparse_ids,
            sp_weights=None,
            combiner="sqrtn")

        hub.add_signature("default", {"sentences": sentences},
                          {"default": combined_embedding})
Example #6
        def preproc_post(raw_post):
            # raw_post is a "scalar string tensor"
            # (https://www.tensorflow.org/versions/r0.12/api_docs/python/image/encoding_and_decoding)
            # Split the input string, assuming whitespace is the delimiter.
            # The client should perform any required tokenization for us and
            # join on ' '.
            # Debug: raw_post = tf.Print(raw_post, [raw_post])
            mxlen = self.task.config_params['preproc']['mxlen']
            raw_tokens = tf.string_split(tf.reshape(raw_post, [-1])).values
            npost = tf.reduce_join(raw_tokens[:mxlen], separator=" ")
            tokens = tf.string_split(tf.reshape(npost, [-1]))
            # Convert the string values to word indices (ints)
            indices = word2index.lookup(tokens)

            # Reshape them out to the proper length
            reshaped = tf.sparse_reshape(indices, shape=[-1])
            reshaped = tf.sparse_reset_shape(reshaped, new_shape=[mxlen])

            # Now convert to a dense representation
            dense = tf.sparse_tensor_to_dense(reshaped)
            return dense
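
As a usage sketch (TF 1.x), the closure is wired to a scalar string placeholder at export time; the placeholder name here is an assumption, and word2index and self.task must exist in the enclosing scope:

    raw_post = tf.placeholder(tf.string, shape=[], name="raw_post")
    word_ids = preproc_post(raw_post)  # dense int ids, shape [mxlen]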
Example #7
  def module_fn_with_preprocessing():
    """Spec function for a full-text embedding module with preprocessing."""
    sentences = tf.placeholder(shape=[None], dtype=tf.string, name="sentences")
    # Perform a minimalistic text preprocessing by removing punctuation and
    # splitting on spaces.
    normalized_sentences = tf.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    tokens = tf.string_split(normalized_sentences, " ")

    # In case some of the input sentences are empty before or after
    # normalization, we will end up with empty rows. We do, however, want to
    # return an embedding for every row, so we fill in the empty rows with a
    # default.
    tokens, _ = tf.sparse_fill_empty_rows(tokens, "")
    # In case all of the input sentences are empty before or after
    # normalization, we will end up with a SparseTensor with shape [?, 0]. After
    # filling in the empty rows we must ensure the shape is set properly to
    # [?, 1].
    tokens = tf.sparse_reset_shape(tokens)

    embeddings_var = tf.get_variable(
        initializer=tf.zeros([vocab_size + num_oov_buckets, embeddings_dim]),
        name=EMBEDDINGS_VAR_NAME,
        dtype=tf.float32)
    lookup_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=num_oov_buckets,
    )
    sparse_ids = tf.SparseTensor(
        indices=tokens.indices,
        values=lookup_table.lookup(tokens.values),
        dense_shape=tokens.dense_shape)

    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=sparse_ids,
        sp_weights=None,
        combiner="sqrtn")

    hub.add_signature("default", {"sentences": sentences},
                      {"default": combined_embedding})
Example #8
def reshape_indices(indices, shape):
    reshaped = tf.sparse_reset_shape(indices, new_shape=shape)
    # Now convert to a dense representation
    x = tf.sparse_tensor_to_dense(reshaped)
    return x
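
A hypothetical call (TF 1.x), with invented ids: padding a two-entry sparse row out to length 4 yields zeros in the new slots.

    ids = tf.SparseTensor(indices=[[0], [1]], values=[7, 9], dense_shape=[2])
    x = reshape_indices(ids, [4])

    with tf.Session() as sess:
        print(sess.run(x))  # [7 9 0 0]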