def preproc_post(raw_post):
    """Convert a whitespace-joined post into fixed-size word/char index tensors.

    The input string is split on whitespace only; the client is expected to
    perform any required tokenization and join the tokens on ' '.

    WARNING: the size limits are read from ``task.config_params['preproc']``.
    This can be a bug if the user left the values at their default (-1):
    for conll the mxlen is 124, for idr the mxlen is forced to a max BPTT,
    and for twpos the mxlen is 38. This should probably be fixed by
    serializing the mxlen of the model, or by rereading it from the tensor
    stored on file.

    ``task``, ``upchars_lut``, ``lchars``, ``word2index`` and ``char2index``
    are not defined here; they are resolved from the enclosing scope.

    Args:
        raw_post: string tensor holding the post; it is flattened before
            being split.

    Returns:
        A tuple ``(x, xch, sentence_length)`` where ``x`` is the dense word
        index tensor constrained to shape ``[mxlen]``, ``xch`` is the dense
        character index tensor constrained to shape ``[mxlen, mxwlen]``, and
        ``sentence_length`` is ``tf.size`` of the flattened sparse word
        indices.
    """
    preproc_params = task.config_params['preproc']
    max_tokens = preproc_params['mxlen']
    max_chars = preproc_params['mxwlen']

    flat_post = tf.reshape(raw_post, [-1])
    all_tokens = tf.string_split(flat_post).values
    # Sentence length must be <= mxlen, so keep only the leading tokens.
    truncated_post = tf.reduce_join(all_tokens[:max_tokens], separator=" ")

    # The vocab has only lowercase words, so lowercase character by character.
    post_chars = tf.string_split(
        tf.reshape(truncated_post, [-1]), delimiter="").values
    upper_positions = upchars_lut.lookup(post_chars)

    def _lower_one(pair):
        # pair is (lookup result, original character). A lookup result above
        # 25 keeps the character unchanged; otherwise it indexes into lchars.
        position = pair[0]
        original_char = pair[1]
        return tf.cond(position > 25,
                       lambda: original_char,
                       lambda: lchars[position])

    lowered_chars = tf.map_fn(
        _lower_one, (upper_positions, post_chars), dtype=tf.string)
    lowered_post = tf.reduce_join(lowered_chars)
    lowered_words = tf.string_split(tf.reshape(lowered_post, [-1]))

    # The number of chars per word must be <= mxwlen. Characters are taken
    # from the text before lowercasing.
    cased_words = tf.string_split(tf.reshape(truncated_post, [-1]))
    clipped_words = tf.substr(cased_words.values, 0, max_chars)
    word_chars = tf.string_split(clipped_words, delimiter='')

    word_ids = word2index.lookup(lowered_words)
    char_ids = char2index.lookup(word_chars)

    # Reshape them out to the proper length.
    flat_word_ids = tf.sparse_reshape(word_ids, shape=[-1])
    # Use tf.shape instead if 2 dims are needed.
    sentence_length = tf.size(flat_word_ids)
    sized_words = tf.sparse_reset_shape(flat_word_ids, new_shape=[max_tokens])
    sized_chars = tf.sparse_reset_shape(
        char_ids, new_shape=[max_tokens, max_chars])

    # Convert to dense tensors and pin their static shapes.
    x = tf.contrib.framework.with_shape(
        [max_tokens], tf.sparse_tensor_to_dense(sized_words))
    xch = tf.contrib.framework.with_shape(
        [max_tokens, max_chars], tf.sparse_tensor_to_dense(sized_chars))
    return x, xch, sentence_length
def _split_string_to_fix_words(line, delimiter, max_words):
    """Split ``line`` on ``delimiter`` and clip the result to ``max_words`` columns.

    Args:
        line: string tensor passed straight to ``tf.string_split``.
        delimiter: delimiter passed straight to ``tf.string_split``.
        max_words: number of columns kept per row.

    Returns:
        The sparse split result, sliced from the origin to
        ``[rows, max_words]`` and with its dense shape reset to that same
        shape, where ``rows`` is the first dimension of the split result.
    """
    tokens = tf.string_split(line, delimiter)
    row_count = tokens.dense_shape[0]
    target_shape = [row_count, max_words]
    # Drop everything beyond max_words columns, then fix the dense shape.
    clipped = tf.sparse_slice(tokens, [0, 0], target_shape)
    return tf.sparse_reset_shape(clipped, target_shape)
def _reshape_indices(self, indices, shape):
    """Resize sparse ``indices`` to ``shape`` and return them as a dense tensor.

    Args:
        indices: sparse tensor whose dense shape is reset.
        shape: target shape, used both for the sparse reset and for the
            shape constraint applied to the dense result.

    Returns:
        The dense tensor, wrapped by ``tf.contrib.framework.with_shape``.
    """
    resized = tf.sparse_reset_shape(indices, new_shape=shape)
    # Now convert to a dense representation and pin its shape.
    dense = tf.sparse_tensor_to_dense(resized)
    return tf.contrib.framework.with_shape(shape, dense)
def decode(serialized_example):
    """Parse one serialized example into a decoded image and a coordinate vector.

    NOTE(review): ``channel``, ``height`` and ``width`` are not defined in
    this function; they are resolved from the enclosing scope. The parsed
    'image/height', 'image/width' and 'image/channel' features are not used
    here -- confirm that this is intended.

    Args:
        serialized_example: a single serialized example, passed to
            ``tf.parse_single_example``.

    Returns:
        A tuple ``(image, coordinates)``. ``image`` is the JPEG-decoded
        'image/data' feature. ``coordinates`` is a flat float32 tensor built
        from the eight corner labels, each divided by ``height`` or
        ``width``, resized to shape ``(2,)``, stacked along axis 1 and
        flattened.
    """
    # Corner labels in output order; entries alternate height, width.
    label_keys = (
        'label/top_left_height',
        'label/top_left_width',
        'label/top_right_height',
        'label/top_right_width',
        'label/bottom_left_height',
        'label/bottom_left_width',
        'label/bottom_right_height',
        'label/bottom_right_width',
    )

    feature_spec = {
        'image/data': tf.FixedLenFeature([], tf.string, default_value=''),
        'image/height': tf.FixedLenFeature([], tf.int64, default_value=0),
        'image/width': tf.FixedLenFeature([], tf.int64, default_value=0),
        'image/channel': tf.FixedLenFeature([], tf.int64, default_value=3),
        'image/name': tf.FixedLenFeature([], tf.string, default_value=''),
        'image/format': tf.FixedLenFeature([], tf.string, default_value=''),
    }
    for key in label_keys:
        feature_spec[key] = tf.VarLenFeature(tf.int64)

    features = tf.parse_single_example(
        serialized_example, features=feature_spec)

    image = tf.image.decode_jpeg(features['image/data'], channels=channel)

    # Height labels are divided by `height`, width labels by `width`.
    divisors = (height, width) * 4
    scaled = [
        features[key] / divisor
        for key, divisor in zip(label_keys, divisors)
    ]

    # Force every label to length 2, filling missing entries with 0.
    resized = [
        tf.sparse_reset_shape(label, new_shape=(2, )) for label in scaled
    ]
    dense = [
        tf.sparse_tensor_to_dense(label, default_value=0)
        for label in resized
    ]

    stacked = tf.stack(dense, axis=1)
    coordinates = tf.reshape(tf.cast(stacked, tf.float32), [-1])
    return image, coordinates
def module_fn_with_preprocessing():
    """Spec function for a full-text embedding module with preprocessing.

    Builds a graph that takes a batch of sentences, strips punctuation,
    splits on spaces, maps the tokens to vocabulary ids and combines the
    token embeddings of each sentence with the "sqrtn" combiner. The result
    is registered as the module's "default" signature.

    ``vocab_size``, ``num_oov_buckets``, ``embeddings_dim``,
    ``EMBEDDINGS_VAR_NAME``, ``vocabulary_file`` and ``hub`` are resolved
    from the enclosing scope.
    """
    sentences = tf.placeholder(shape=[None], dtype=tf.string,
                               name="sentences")

    # Minimalistic text preprocessing: remove punctuation, split on spaces.
    without_punctuation = tf.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    tokens = tf.string_split(without_punctuation, " ")

    table_rows = vocab_size + num_oov_buckets
    initial_embeddings = tf.zeros([table_rows, embeddings_dim])
    embeddings_var = tf.get_variable(
        initializer=initial_embeddings,
        name=EMBEDDINGS_VAR_NAME,
        dtype=tf.float32)

    # Vocabulary file: the whole line is the key, the line number the id.
    vocab_initializer = tf.lookup.TextFileInitializer(
        vocabulary_file,
        tf.string, tf.lookup.TextFileIndex.WHOLE_LINE,
        tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER)
    vocab_table = tf.lookup.StaticVocabularyTable(
        vocab_initializer, num_oov_buckets=num_oov_buckets)

    token_ids = vocab_table.lookup(tokens.values)
    sparse_ids = tf.SparseTensor(
        indices=tokens.indices,
        values=token_ids,
        dense_shape=tokens.dense_shape)

    # Sentences that are empty before or after normalization produce empty
    # rows. An embedding is still wanted for every row, so empty rows are
    # filled with the id that the table assigns to the empty string.
    empty_token_id = vocab_table.lookup(tf.constant(""))
    sparse_ids, _ = tf.sparse_fill_empty_rows(sparse_ids, empty_token_id)

    # If all input sentences are empty before or after normalization, the
    # SparseTensor has shape [?, 0]. After filling the empty rows the shape
    # must be set properly to [?, 1]. At this point there are no empty rows,
    # so the new shape will be
    # [sparse_ids.dense_shape[0], max(1, sparse_ids.dense_shape[1])].
    sparse_ids = tf.sparse_reset_shape(sparse_ids)

    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=sparse_ids,
        sp_weights=None,
        combiner="sqrtn")

    signature_inputs = {"sentences": sentences}
    signature_outputs = {"default": combined_embedding}
    hub.add_signature("default", signature_inputs, signature_outputs)
def preproc_post(raw_post):
    """Convert a whitespace-joined post into a dense word index tensor.

    ``raw_post`` is a "scalar string tensor"
    (https://www.tensorflow.org/versions/r0.12/api_docs/python/image/encoding_and_decoding).
    The string is split on whitespace only; the client is expected to
    perform any required tokenization and join the tokens on ' '.

    ``self`` and ``word2index`` are not defined here; they are resolved from
    the enclosing scope.

    Args:
        raw_post: string tensor holding the post; it is flattened before
            being split.

    Returns:
        The dense word index tensor, after the sparse indices have been
        flattened and reset to shape ``[mxlen]``.
    """
    max_tokens = self.task.config_params['preproc']['mxlen']

    flat_post = tf.reshape(raw_post, [-1])
    all_tokens = tf.string_split(flat_post).values
    # Keep at most mxlen tokens, then re-split to get a sparse tensor back.
    truncated_post = tf.reduce_join(all_tokens[:max_tokens], separator=" ")
    kept_tokens = tf.string_split(tf.reshape(truncated_post, [-1]))

    # Convert the string values to word indices (ints).
    word_ids = word2index.lookup(kept_tokens)

    # Reshape them out to the proper length.
    flat_ids = tf.sparse_reshape(word_ids, shape=[-1])
    sized_ids = tf.sparse_reset_shape(flat_ids, new_shape=[max_tokens])

    # Now convert to a dense representation.
    return tf.sparse_tensor_to_dense(sized_ids)
def module_fn_with_preprocessing():
    """Spec function for a full-text embedding module with preprocessing.

    Builds a graph that takes a batch of sentences, strips punctuation,
    splits on spaces, maps the tokens to vocabulary ids and combines the
    token embeddings of each sentence with the "sqrtn" combiner. The result
    is registered as the module's "default" signature.

    ``vocab_size``, ``num_oov_buckets``, ``embeddings_dim``,
    ``EMBEDDINGS_VAR_NAME``, ``vocabulary_file`` and ``hub`` are resolved
    from the enclosing scope.
    """
    sentences = tf.placeholder(shape=[None], dtype=tf.string,
                               name="sentences")

    # Minimalistic text preprocessing: remove punctuation, split on spaces.
    without_punctuation = tf.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    tokens = tf.string_split(without_punctuation, " ")

    # Sentences that are empty before or after normalization produce empty
    # rows. An embedding is still wanted for every row, so empty rows are
    # filled with a default (the empty string).
    tokens, _ = tf.sparse_fill_empty_rows(tokens, "")

    # If all input sentences are empty before or after normalization, the
    # SparseTensor has shape [?, 0]. After filling the empty rows the shape
    # must be set properly to [?, 1].
    tokens = tf.sparse_reset_shape(tokens)

    table_rows = vocab_size + num_oov_buckets
    initial_embeddings = tf.zeros([table_rows, embeddings_dim])
    embeddings_var = tf.get_variable(
        initializer=initial_embeddings,
        name=EMBEDDINGS_VAR_NAME,
        dtype=tf.float32)

    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=num_oov_buckets,
    )

    token_ids = vocab_table.lookup(tokens.values)
    sparse_ids = tf.SparseTensor(
        indices=tokens.indices,
        values=token_ids,
        dense_shape=tokens.dense_shape)

    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=sparse_ids,
        sp_weights=None,
        combiner="sqrtn")

    signature_inputs = {"sentences": sentences}
    signature_outputs = {"default": combined_embedding}
    hub.add_signature("default", signature_inputs, signature_outputs)
def reshape_indices(indices, shape):
    """Resize sparse ``indices`` to ``shape`` and return them as a dense tensor.

    Args:
        indices: sparse tensor whose dense shape is reset.
        shape: new dense shape passed to ``tf.sparse_reset_shape``.

    Returns:
        The dense representation of the resized sparse tensor.
    """
    resized = tf.sparse_reset_shape(indices, new_shape=shape)
    # Now convert to a dense representation.
    return tf.sparse_tensor_to_dense(resized)