def from_tokens(raw, lookup_):
    """Turn a batch of token ids into whitespace-split string tokens.

    Args:
        raw: numeric tensor of token ids; cast to int32 before the lookup.
            Assumed to be rank 2 because the pieces are joined along
            axis 1 -- TODO confirm against callers.
        lookup_: string tensor indexed by token id.

    Returns:
        The result of `tf.string_split` applied to the cleaned strings.
    """
    ids = tf.cast(raw, tf.int32)
    pieces = tf.gather(lookup_, ids)
    text = tf.reduce_join(pieces, axis=1)
    # Cut each string at the "<EOS>" marker (marker included).
    text = tf.regex_replace(text, b"<EOS>.*", b"")
    # Underscores are turned into spaces so the split below separates words.
    text = tf.regex_replace(text, b"_", b" ")
    return tf.string_split(text, " ")
def _bpe2word_with_pad(tensor):
    """Merge sub-word pieces into words and pad the result with "⍷".

    "<pad>" pieces are first rewritten to "▁⍷", all pieces are concatenated
    into a single string, and every "▁" becomes a space so that the string
    can be split into words.

    Args:
        tensor: string tensor of pieces.

    Returns:
        A 1-D string tensor: the words followed by
        `MAX_WORD_LEN - number_of_words` copies of "⍷".
    """
    with_pad_markers = tf.regex_replace(tensor, "<pad>", "▁⍷")
    sentence = tf.regex_replace(tf.reduce_join(with_pad_markers), "▁", " ")
    words = tf.string_split([sentence]).values
    # NOTE(review): this count is negative when there are more than
    # MAX_WORD_LEN words -- confirm callers never produce that.
    missing = MAX_WORD_LEN - tf.shape(words)[0]
    filler = tf.tile(tf.constant(["⍷"]), [missing])
    return tf.concat([words, filler], axis=-1)
def _ComputeDecoderMetrics(self, decoder_outs, input_batch):
    """Computes metrics on output from decoder.

    Args:
        decoder_outs: A `BeamSearchDecodeOutput`, a namedtuple containing the
            decode results.
        input_batch: A `NestedMap` of tensors representing the source, target,
            and other components of the input batch.

    Returns:
        A dict of Tensors containing decoder output and metrics.
    """
    p = self.params
    hyps_per_beam = p.decoder.beam_search.num_hyps_per_beam
    topk = self._GetTopK(decoder_outs)

    tgt = input_batch.tgt
    if p.target_key:
        tgt = input_batch.additional_tgts[p.target_key]

    # Per-row count of non-padded positions, minus one.
    label_lens = tf.cast(
        tf.reduce_sum(1.0 - tgt.paddings, 1) - 1.0, tf.int32)
    transcripts = self.input_generator.IdsToStrings(tgt.labels, label_lens)

    # Filter out all isolated '<noise>' tokens.
    noise_pattern = ' <noise> |^<noise> | <noise>$|^<noise>$'
    refs = tf.regex_replace(transcripts, noise_pattern, ' ')
    hyps = tf.regex_replace(topk.decoded, noise_pattern, ' ')

    # Compute translation quality scores for all hyps: every reference is
    # repeated once per hypothesis, then both sides are flattened.
    refs = tf.tile(tf.reshape(refs, [-1, 1]), [1, hyps_per_beam])
    hyps = tf.reshape(hyps, [-1])
    refs = tf.reshape(refs, [-1])
    norm_wer_errors, norm_wer_words = self._ComputeNormalizedWER(hyps, refs)

    ret_dict = {
        'target_ids': tgt.ids,
        'target_labels': tgt.labels,
        'target_weights': tgt.weights,
        'target_paddings': tgt.paddings,
        'utt_id': input_batch.sample_ids,
        'transcripts': transcripts,
        'topk_decoded': topk.decoded,
        'topk_ids': topk.ids,
        'topk_lens': topk.lens,
        'topk_scores': topk.scores,
        'norm_wer_errors': norm_wer_errors,
        'norm_wer_words': norm_wer_words,
    }
    extra_metrics = self.AddAdditionalDecoderMetricsToGraph(
        topk, hyps, refs, input_batch, decoder_outs)
    ret_dict.update(extra_metrics)
    return ret_dict
def Decode(self):
    """Constructs the inference graph.

    Returns:
        A dict of Tensors with the targets, the reference transcripts, the
        top-k decoder outputs, the normalized WER statistics, and whatever
        `AddAdditionalDecoderMetricsToGraph` contributes.
    """
    p = self.params
    with tf.name_scope('fprop'), tf.name_scope(p.name):
        batch = self.input_generator.GetPreprocessedInputBatch()
        src_enc, src_enc_padding, _ = self.encoder.FPropDefaultTheta(
            batch.src)

        if hasattr(self.decoder, 'contextualizer'):
            self.decoder.contextualizer.SetContextMap(batch.tgt)

        decoder_outs = self.decoder.BeamSearchDecode(src_enc, src_enc_padding)
        topk = self._GetTopK(decoder_outs)

        tgt = batch.tgt
        if p.target_key:
            tgt = batch.additional_tgts[p.target_key]

        # Per-row count of non-padded positions, minus one.
        label_lens = tf.cast(
            tf.reduce_sum(1.0 - tgt.paddings, 1) - 1.0, tf.int32)
        transcripts = self.input_generator.IdsToStrings(
            tgt.labels, label_lens)

        # Filter out all isolated '<noise>' tokens.
        noise_pattern = ' <noise> |^<noise> | <noise>$|^<noise>$'
        refs = tf.regex_replace(transcripts, noise_pattern, ' ')
        hyps = tf.regex_replace(topk.decoded, noise_pattern, ' ')

        # Compute translation quality scores for all hyps: every reference
        # is repeated once per hypothesis, then both sides are flattened.
        hyps_per_beam = p.decoder.beam_search.num_hyps_per_beam
        refs = tf.tile(tf.reshape(refs, [-1, 1]), [1, hyps_per_beam])
        hyps = tf.reshape(hyps, [-1])
        refs = tf.reshape(refs, [-1])
        norm_wer_errors, norm_wer_words = self._ComputeNormalizedWER(
            hyps, refs)

        ret_dict = {
            'target_ids': tgt.ids,
            'target_labels': tgt.labels,
            'target_weights': tgt.weights,
            'target_paddings': tgt.paddings,
            'utt_id': batch.sample_ids,
            'transcripts': transcripts,
            'topk_decoded': topk.decoded,
            'topk_ids': topk.ids,
            'topk_lens': topk.lens,
            'topk_scores': topk.scores,
            'norm_wer_errors': norm_wer_errors,
            'norm_wer_words': norm_wer_words,
        }
        extra_metrics = self.AddAdditionalDecoderMetricsToGraph(
            topk, hyps, refs)
        ret_dict.update(extra_metrics)
        return ret_dict
def load_img_cuhk(image_file, img_width, img_height, is_train):
    """Load an input JPEG and the photo whose path is derived from it.

    The photo path is obtained from `image_file` by applying, in order, the
    rewrites "sketches" -> "photos", "-sz1" -> "", "F2-" -> "f-" and
    "M2-" -> "m-".

    Args:
        image_file: string tensor with the path of the input image.
        img_width: forwarded to `_load_image`.
        img_height: forwarded to `_load_image`.
        is_train: forwarded to `_load_image`.

    Returns:
        Whatever `_load_image` returns for the (input, photo) pair.
    """
    sketch = tf.image.decode_jpeg(tf.read_file(image_file), channels=3)

    path_rewrites = (
        ("sketches", "photos"),
        ("-sz1", ""),
        ("F2-", "f-"),
        ("M2-", "m-"),
    )
    photo_path = image_file
    for pattern, rewrite in path_rewrites:
        photo_path = tf.regex_replace(photo_path, pattern, rewrite)

    # The photo is decoded without forcing a channel count.
    photo = tf.image.decode_jpeg(tf.read_file(photo_path))
    return _load_image(sketch, photo, img_width, img_height, is_train)
def parse_example(parsed_features):
    """Unpack a parsed example into the tuple used downstream.

    Args:
        parsed_features: mapping from feature key to tensor; must provide
            'example/input' and the 'comb/*' keys read below.

    Returns:
        A 7-tuple `(input, label, "type1 x type2", "inst1 x inst2", genre,
        id, "file1 x file2")`. Spaces inside the two instrument names are
        replaced by underscores before they are joined.
    """

    def _pair(first, second):
        return tf.string_join([first, ' x ', second])

    label = parsed_features['comb/label']
    ins = parsed_features['example/input']
    inst1 = tf.regex_replace(parsed_features['comb/inst1'], ' ', '_')
    inst2 = tf.regex_replace(parsed_features['comb/inst2'], ' ', '_')
    genre = parsed_features['comb/genre']
    example_id = parsed_features['comb/id']

    types = _pair(parsed_features['comb/type1'], parsed_features['comb/type2'])
    insts = _pair(inst1, inst2)
    files = _pair(parsed_features['comb/file1'], parsed_features['comb/file2'])
    return ins, label, types, insts, genre, example_id, files
def decode(line):
    """Parse one delimited text line into a float32 vector.

    The line is split on `self.field_delim`; the first field is dropped when
    `self.index` is truthy. Any field that, ignoring surrounding whitespace,
    equals one of `self.na_values` is replaced by 'nan' before the numeric
    conversion.

    Args:
        line: scalar string tensor holding one line of the file.

    Returns:
        A 1-D float32 tensor with one entry per remaining field.
    """
    import re

    fields = tf.string_split([line], self.field_delim).values
    if self.index:
        # Skip index
        fields = fields[1:]

    na_values = list(self.na_values or ())
    # With no NA markers the joined pattern would be empty, and an empty
    # pattern matches at every position, so skip the replacement entirely.
    if na_values:
        # NA markers are literal strings: escape regex metacharacters (a
        # bare '?' is not even a valid pattern) and match the whole field so
        # that a marker such as '-' cannot eat the sign of '-1.5'.
        alternatives = '|'.join(re.escape(value) for value in na_values)
        na_pattern = r'^\s*(?:%s)\s*$' % alternatives
        fields = tf.regex_replace(fields, na_pattern, 'nan')

    return tf.string_to_number(fields, tf.float32)
def load_img_celeba(image_file, img_width, img_height, is_train):
    """Load an input JPEG and the photo whose path is derived from it.

    The photo path is `image_file` with "landmarks" replaced by "photos".

    Args:
        image_file: string tensor with the path of the input image.
        img_width: forwarded to `_load_image`.
        img_height: forwarded to `_load_image`.
        is_train: forwarded to `_load_image`.

    Returns:
        Whatever `_load_image` returns for the (input, photo) pair.
    """
    landmarks = tf.image.decode_jpeg(tf.read_file(image_file), channels=3)

    photo_path = tf.regex_replace(image_file, "landmarks", "photos")
    # The photo is decoded without forcing a channel count.
    photo = tf.image.decode_jpeg(tf.read_file(photo_path))

    return _load_image(landmarks, photo, img_width, img_height, is_train)
def serving_input_fn():
    """Build the serving input receiver that reads an image from a filename.

    Relies on `channel_ids`, `separator` and `resized_size` from the
    enclosing scope. Without `channel_ids` the file itself is decoded with
    three channels. Otherwise one file per channel id is read, its name
    obtained by inserting `separator + channel_id` before the trailing
    '.png' of the filename, and the decoded channels are concatenated along
    the last axis.

    Returns:
        A `tf.estimator.export.ServingInputReceiver` whose default input is
        the 'filename' placeholder, with alternatives 'from_image' and
        'from_resized_images'.
    """
    print('new serving_input_fn')

    # define placeholder for filename
    filename = tf.placeholder(dtype=tf.string)

    def _read_image(path, n_channels):
        raw = tf.read_file(path)
        return tf.to_float(
            tf.image.decode_jpeg(raw,
                                 channels=n_channels,
                                 try_recover_truncated=True))

    # TODO : make it batch-compatible (with Dataset or string input producer)
    if not channel_ids:
        decoded_image = _read_image(filename, 3)
    else:
        channel_num = 3 if len(channel_ids) == 1 else 1
        channel_images = []
        for channel_id in channel_ids:
            # Escape the dot and anchor at the end: the former '.png'
            # pattern also rewrote any "<char>png" found elsewhere in the
            # path (e.g. a directory called "mypng").
            channel_name = tf.regex_replace(
                filename, r'\.png$', separator + channel_id + '.png')
            channel_images.append(_read_image(channel_name, channel_num))
        if len(channel_images) == 1:
            decoded_image = channel_images[0]
        else:
            decoded_image = tf.concat(channel_images, 2)

    original_shape = tf.shape(decoded_image)[:2]

    if resized_size is not None and resized_size > 0:
        image = resize_image(decoded_image, resized_size)
    else:
        image = decoded_image

    # Add the leading batch dimension.
    image_batch = image[None]
    features = {'images': image_batch, 'original_shape': original_shape}
    receiver_inputs = {'filename': filename}
    alternatives = {
        'from_image': {'image': decoded_image},
        'from_resized_images': {'resized_images': image_batch},
    }
    return tf.estimator.export.ServingInputReceiver(
        features,
        receiver_inputs,
        receiver_tensors_alternatives=alternatives)
def from_characters(raw, lookup_):
    """Convert ascii+2 encoded codes to string-tokens.

    Args:
        raw: integer tensor of character codes offset by 2.
        lookup_: string tensor indexed by the corrected code.

    Returns:
        The result of `tf.string_split` on the reassembled strings.
    """
    # Undo the +2 offset and clamp to the byte range.
    shifted = tf.clip_by_value(tf.subtract(raw, 2), 0, 255)
    # presumably `raw` is a multi-byte integer type, so the bitcast adds a
    # trailing per-byte axis of which only index 0 is kept -- TODO confirm.
    as_bytes = tf.bitcast(shifted, tf.uint8)
    chars = tf.gather(lookup_, tf.cast(as_bytes, tf.int32))[:, :, 0]

    text = tf.reduce_join(chars, axis=1)
    # Strip NUL characters before splitting on spaces.
    text = tf.regex_replace(text, b"\0", b"")
    return tf.string_split(text, " ")
def load_img_to_tensor(dict_type_to_imagepath):
    """Load every PNG named in the mapping as a 352x1216 tensor.

    A literal "$KITTIPATH" in each path is replaced by a hard-coded dataset
    root. Entries of type 'labelM' are decoded as uint8 and cast to
    float32; all other entries are decoded as uint16 and cast to int32.

    Args:
        dict_type_to_imagepath: mapping from image type to its file path.

    Returns:
        A dict with the same keys whose values are the decoded images,
        cropped or padded to 352 x 1216.
    """
    dict_res = {}
    for str_type, str_filepath in dict_type_to_imagepath.items():
        # NOTE: the dataset root is hard-coded instead of being read from
        # the KITTIPATH environment variable, so there is no "not defined"
        # case to handle here.
        if str_type == 'labelM':
            kittipath = '/notebooks/dataset'
            png_dtype, out_dtype = tf.uint8, tf.float32
        else:
            kittipath = '/notebooks/dataset/'
            png_dtype, out_dtype = tf.uint16, tf.int32

        # Raw string: '\$' is not a valid Python escape sequence.
        resolved_path = tf.regex_replace(str_filepath,
                                         tf.constant(r'\$KITTIPATH'),
                                         tf.constant(kittipath))
        tf_tensor = tf.image.decode_png(tf.read_file(resolved_path),
                                        dtype=png_dtype)
        tf_tensor = tf.cast(tf_tensor, dtype=out_dtype)
        dict_res[str_type] = tf.image.resize_image_with_crop_or_pad(
            tf_tensor, 352, 1216)
    return dict_res
def parse_text_line(line, path):
    """Turn one "image_path label_path" line into a training example.

    Args:
        line: scalar string tensor with two whitespace-separated file names.
        path: replacement for every "/SegNet" occurrence in both file names.

    Returns:
        A `({'image': image}, label)` pair. The image is float32 scaled by
        1/255; both are passed through `randomly_scale` with the target size
        `[FLAGS.height, FLAGS.width]`, and the label additionally through
        `replace_ignore_label`.
    """
    columns = tf.string_split([line]).values

    # Raw string: "\/" is not a valid Python escape sequence. The pattern
    # itself is unchanged.
    pattern = r"\/SegNet"
    image_filename = tf.regex_replace(columns[0], pattern, path)
    label_filename = tf.regex_replace(columns[1], pattern, path)

    image = tf.cast(get_image_tensor(image_filename, channels=3), tf.float32)
    label = tf.cast(get_image_tensor(label_filename, channels=1), tf.int32)
    image = image / 255.

    target_size = [FLAGS.height, FLAGS.width]
    resized_image, resized_label = randomly_scale(image, label, target_size)
    resized_label = replace_ignore_label(resized_label)
    return {'image': resized_image}, resized_label
def _ComputeNormalizedWER(self, hyps, refs):
    """Computes word errors and reference word counts per hypothesis.

    Args:
        hyps: string tensor of hypotheses.
        refs: string tensor of references, aligned with `hyps`.

    Returns:
        A `(norm_wer_errors, norm_wer_words)` pair, each reshaped to
        `[-1, num_hyps_per_beam]`.
    """
    hyps_per_beam = self.params.decoder.beam_search.num_hyps_per_beam

    # Filter out all '<epsilon>' tokens for norm_wer computation.
    cleaned_hyps = tf.regex_replace(hyps, '(<epsilon>)+', ' ')

    # norm_wer is size [num_transcripts * hyps_per_beam, 2]
    norm_wer = decoder_utils.ComputeWer(cleaned_hyps, refs)

    # Split into two tensors of size [num_transcripts * hyps_per_beam, 1]
    errors, words = tf.split(norm_wer, [1, 1], 1)

    per_beam_shape = [-1, hyps_per_beam]
    return (tf.reshape(errors, per_beam_shape),
            tf.reshape(words, per_beam_shape))
def load_images(self, filename):
    """Load an RGB image together with its mask and depth map.

    The mask path is `filename` with "rgb" replaced by "mask". The depth
    path is `filename` with "rgb" replaced by "depth" and then "shapenet"
    replaced by "shapenet_depth".

    Args:
        filename: string tensor with the path of the RGB PNG.

    Returns:
        `(image, mask_gray, depth_gray, filename)`. All three images are
        resized to `self.img_size` x `self.img_size`; mask and depth are
        converted to grayscale.
    """
    target_size = [self.img_size, self.img_size]

    def _load_resized(path):
        decoded = tf.image.decode_png(tf.read_file(path))
        return tf.image.resize_images(decoded, target_size)

    image_resized = _load_resized(filename)

    mask_path = tf.regex_replace(filename, "rgb", "mask")
    mask_gray = tf.image.rgb_to_grayscale(_load_resized(mask_path))

    depth_path = tf.regex_replace(filename, "rgb", "depth")
    depth_path = tf.regex_replace(depth_path, "shapenet", "shapenet_depth")
    depth_gray = tf.image.rgb_to_grayscale(_load_resized(depth_path))

    return image_resized, mask_gray, depth_gray, filename
def load_and_resize_image(filename: str,
                          channels: int,
                          size: int = None,
                          interpolation: str = 'BILINEAR',
                          channel_ids: list = (),
                          separator: str = '') -> tf.Tensor:
    """Loads an image from its filename and resizes it to the desired output size.

    When `channel_ids` is given, one file per channel id is read instead of
    `filename` itself. Each file name is obtained by inserting
    `separator + channel_id` before the trailing '.png' of `filename`, and
    the decoded channels are concatenated along the last axis.

    :param filename: string tensor
    :param channels: number of channels for the decoded image (only used when `channel_ids` is empty)
    :param size: number of desired pixels in the resized image, tf.Tensor or int (None or a
        non-positive int for no resizing)
    :param interpolation: forwarded to `resize_image`
    :param channel_ids: ids of the per-channel files to read; empty to read `filename` directly
    :param separator: string placed between the file stem and each channel id
    :return: decoded and resized float32 tensor [h, w, channels],
    """

    def _read_image(path, n_channels):
        raw = tf.read_file(path)
        return tf.to_float(
            tf.image.decode_jpeg(raw,
                                 channels=n_channels,
                                 try_recover_truncated=True))

    with tf.name_scope('load_img'):
        if not channel_ids:
            decoded_image = _read_image(filename, channels)
        else:
            channel_num = 3 if len(channel_ids) == 1 else 1
            channel_images = []
            for channel_id in channel_ids:
                # Escape the dot and anchor at the end: the former '.png'
                # pattern also rewrote any "<char>png" found elsewhere in
                # the path (e.g. a directory called "mypng").
                channel_name = tf.regex_replace(
                    filename, r'\.png$', separator + channel_id + '.png')
                channel_images.append(_read_image(channel_name, channel_num))
            if len(channel_images) == 1:
                decoded_image = channel_images[0]
            else:
                decoded_image = tf.concat(channel_images, 2)

        # TODO : if one side is smaller than size of patches (and make patches == true),
        # TODO : force the image to have at least patch size
        no_resize = size is None or (isinstance(size, int) and size <= 0)
        if no_resize:
            return decoded_image
        return resize_image(decoded_image, size, interpolation)
def get_usr_fields(hparams):
    """
    Create one string placeholder and one processed tensor per user field.

    A user field is every entry of hparams.feature_names starting with 'usr_'. When
    hparams.regex_replace_pattern is set, each match is rewritten to " \\1 " (the first capture group
    surrounded by spaces) before the text is handed to data_fn.process_text.

    :param hparams: hparams
    :return: (usr_fields, usr_text_placeholders), two lists aligned by user field
    """
    usr_text_placeholders = []
    usr_fields = []
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)

    for ftr_name in hparams.feature_names:
        if not ftr_name.startswith('usr_'):
            continue

        # With add_first_dim_for_usr_placeholder the placeholder has shape [None] so that the usr
        # field can be fed like a document feature in model serving; otherwise it is a scalar string.
        placeholder_shape = [None] if hparams.add_first_dim_for_usr_placeholder else []
        placeholder = tf.placeholder(shape=placeholder_shape, dtype=tf.string,
                                     name=ftr_name + "_placeholder")
        usr_text_placeholders.append(placeholder)

        field = placeholder
        # add whitespace on both sides of punctuations if regex pattern is not None
        if hparams.regex_replace_pattern is not None:
            field = tf.regex_replace(input=field,
                                     pattern=hparams.regex_replace_pattern,
                                     rewrite=" \\1 ")
        # remove added dimension
        if hparams.add_first_dim_for_usr_placeholder:
            field = tf.squeeze(field, [0])
        field = tf.expand_dims(field, axis=0)

        cnn_window = max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0
        field = data_fn.process_text(field,
                                     tf_vocab_table,
                                     hparams.CLS,
                                     hparams.SEP,
                                     hparams.PAD,
                                     hparams.max_len,
                                     hparams.min_len,
                                     cnn_filter_window_size=cnn_window)
        usr_fields.append(field)

    return usr_fields, usr_text_placeholders
def module_fn_with_preprocessing():
    """Spec function for a full-text embedding module with preprocessing.

    Registers a "default" signature mapping a batch of sentences to one
    combined embedding per sentence.
    """
    sentences = tf.placeholder(shape=[None], dtype=tf.string, name="sentences")

    # Minimal text preprocessing: strip punctuation, then split on spaces.
    without_punct = tf.regex_replace(input=sentences,
                                     pattern=r"\pP",
                                     rewrite="")
    tokens = tf.string_split(without_punct, " ")

    embeddings_var = tf.get_variable(
        initializer=tf.zeros([vocab_size + num_oov_buckets, embeddings_dim]),
        name=EMBEDDINGS_VAR_NAME,
        dtype=tf.float32)

    # Vocabulary file: one token per line, the line number is the id.
    vocab_initializer = tf.lookup.TextFileInitializer(
        vocabulary_file,
        tf.string, tf.lookup.TextFileIndex.WHOLE_LINE,
        tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER)
    vocab_table = tf.lookup.StaticVocabularyTable(
        vocab_initializer, num_oov_buckets=num_oov_buckets)

    token_ids = tf.SparseTensor(indices=tokens.indices,
                                values=vocab_table.lookup(tokens.values),
                                dense_shape=tokens.dense_shape)

    # Sentences that are empty before or after normalization produce empty
    # rows. An embedding is still wanted for every row, so empty rows are
    # filled with the id of the empty string.
    empty_token_id = vocab_table.lookup(tf.constant(""))
    token_ids, _ = tf.sparse_fill_empty_rows(token_ids, empty_token_id)

    # If every sentence was empty the SparseTensor had shape [?, 0]. Now that
    # no row is empty, reset the shape so it becomes
    # [dense_shape[0], max(1, dense_shape[1])].
    token_ids = tf.sparse_reset_shape(token_ids)

    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=token_ids,
        sp_weights=None,
        combiner="sqrtn")

    hub.add_signature("default",
                      {"sentences": sentences},
                      {"default": combined_embedding})
def load_img_to_tensor(dict_type_to_imagepath):
    """Load every PNG named in the mapping as an int32 tensor.

    A literal "$KITTIPATH" in each path is replaced by the value of the
    KITTIPATH environment variable. When the variable is not set a warning
    is printed and the path is used unchanged.

    Args:
        dict_type_to_imagepath: mapping from image type to its file path.

    Returns:
        A dict with the same keys whose values are the images decoded as
        16-bit PNGs and cast to int32.
    """
    # Look the variable up explicitly instead of catching every exception,
    # so unrelated errors are no longer reported as "not defined".
    kittipath = os.environ.get('KITTIPATH')

    dict_res = {}
    for str_type, str_filepath in dict_type_to_imagepath.items():
        if kittipath is None:
            print(
                "WARNING: KITTIPATH not defined - this may result in errors!")
        else:
            # Raw string: '\$' is not a valid Python escape sequence.
            str_filepath = tf.regex_replace(str_filepath,
                                            tf.constant(r'\$KITTIPATH'),
                                            tf.constant(kittipath))
        tf_tensor = tf.image.decode_png(tf.read_file(str_filepath),
                                        dtype=tf.uint16)
        dict_res[str_type] = tf.cast(tf_tensor, dtype=tf.int32)
    return dict_res
def load_img_to_tensor(self, dict_type_to_imagepath):
    """Load every PNG named in the mapping as a scaled float32 tensor.

    A literal "$KITTIPATH" in each path is replaced by the value of the
    KITTIPATH environment variable. When the variable is not set a warning
    is printed and the path is used unchanged.

    Args:
        dict_type_to_imagepath: mapping from image type to its file path.

    Returns:
        A dict with the same keys whose values are the images decoded as
        16-bit PNGs, cropped or padded to `self.parameters.image_size`,
        cast to float32 and divided by 256.
    """
    # Look the variable up explicitly instead of catching every exception,
    # so unrelated errors are no longer reported as "not defined".
    kittipath = os.environ.get('KITTIPATH')
    target_height = self.parameters.image_size[0]
    target_width = self.parameters.image_size[1]

    dict_res = {}
    for str_type, str_filepath in dict_type_to_imagepath.items():
        if kittipath is None:
            print("WARNING: KITTIPATH not defined - this may result in errors!")
        else:
            # Raw string: '\$' is not a valid Python escape sequence.
            str_filepath = tf.regex_replace(str_filepath,
                                            tf.constant(r'\$KITTIPATH'),
                                            tf.constant(kittipath))
        tf_tensor = tf.image.decode_png(tf.read_file(str_filepath),
                                        dtype=tf.uint16)
        tf_tensor = tf.image.resize_image_with_crop_or_pad(
            tf_tensor, target_height, target_width)
        tf_tensor = tf.cast(tf_tensor, dtype=tf.float32)
        dict_res[str_type] = tf.divide(tf_tensor, 256.0)
    return dict_res
def vectorize_smile(data_dict, vocab, data_hparams):
    """Vectorize the SMILEs and generate the sequence inputs and labels.

    Args:
        data_dict: mapping holding the SMILE string tensor under "smile".
        vocab: vocabulary providing `GO_ID`, `EOS_ID` and `len()`.
        data_hparams: provides `skip_at_symbol` and `max_seq_len`.

    Returns:
        A dict with the (possibly '@'-stripped) "smile", "decoder_lens"
        (token count + 1), one-hot "decoder_inputs" (GO + tokens),
        "decoder_labels" (tokens + EOS) and one-hot "encoder_inputs".
    """
    smile = data_dict["smile"]

    def tokenizer(x):
        return true_smile_tokenizer(
            x, skip_at_symbol=data_hparams.skip_at_symbol)

    if data_hparams.skip_at_symbol:
        smile = tf.regex_replace(smile, "@", "")

    def py_func_tokenize_smile(smi):
        """Tokenize one SMILE string into an int32 id array."""
        ids = sentence_to_token_ids(smi,
                                    vocabulary=vocab,
                                    tokenizer=tokenizer)
        ids = np.array(ids, dtype=np.int32)
        # Keep one slot free for EOS_ID.
        limit = data_hparams.max_seq_len - 1
        if len(ids) > limit:
            ids = ids[:limit]
        return ids

    # Raw encode of the SMILEs.
    tokens = tf.py_func(py_func_tokenize_smile, [smile], tf.int32)
    tokens.set_shape((None, ))
    seq_len = tf.shape(tokens)[0] + 1

    go_token = tf.constant([vocab.GO_ID], dtype=tokens.dtype)
    eos_token = tf.constant([vocab.EOS_ID], dtype=tokens.dtype)
    vocab_size = len(vocab)

    # Labels are the tokens shifted left with EOS appended. [seq_length]
    seq_labels = tf.concat([tokens, eos_token], -1)
    # Inputs are the tokens with GO prepended, one-hot encoded.
    # -> [? (seq_length), TOK_DIM]
    seq_inputs = tf.one_hot(tf.concat([go_token, tokens], -1),
                            vocab_size,
                            dtype=tf.float32)
    # One-hot encoder inputs. -> [? (seq_length), TOK_DIM]
    encoder_inputs = tf.one_hot(tokens, vocab_size, dtype=tf.float32)

    return {
        "smile": smile,
        "decoder_lens": seq_len,
        "decoder_inputs": seq_inputs,
        "decoder_labels": seq_labels,
        "encoder_inputs": encoder_inputs
    }
def preprocessing_fn(inputs):
    """
    Preprocess input columns into transformed columns.

    Args:
      inputs (dict): dict of input columns

    Returns:
      output dict of transformed columns
    """
    outputs = {}

    # Categorical column -> vocabulary index.
    outputs['MixingSpeed'] = tft.compute_and_apply_vocabulary(
        inputs['MixingSpeed'])
    outputs['ButterMass'] = inputs['ButterMass']

    # Derived features: total mass and z-scored share of each ingredient.
    total_mass = (inputs['ButterMass'] + inputs['SugarMass'] +
                  inputs['FlourMass'])
    outputs['TotalMass'] = total_mass
    for ingredient in ('Butter', 'Sugar', 'Flour'):
        share = inputs['{}Mass'.format(ingredient)] / total_mass
        outputs['Norm{}perc'.format(ingredient)] = tft.scale_to_z_score(share)

    # Numeric columns passed through unchanged.
    for passthrough_key in ('TotalVolume', 'Energy'):
        outputs[passthrough_key] = inputs[passthrough_key]

    # Remaining numeric columns are z-scored.
    z_scored_keys = (
        'ButterTemperature',
        'SugarHumidity',
        'FlourHumidity',
        'HeatingTime',
        'MixingTime',
        'Density',
        'Temperature',
        'Humidity',
    )
    for z_key in z_scored_keys:
        outputs[z_key] = tft.scale_to_z_score(inputs[z_key])

    # 'Chunks' is 1.0 when the Problems text rewrites to exactly 'chunk',
    # i.e. when it mentions "chunk", and 0.0 otherwise.
    rewritten_problems = tf.regex_replace(input=inputs['Problems'],
                                          pattern='.*chunk.*',
                                          rewrite='chunk',
                                          name='DetectChunk')
    has_chunk = tf.equal(rewritten_problems, 'chunk')
    outputs['Chunks'] = tf.cast(has_chunk, tf.float32)

    return outputs
def add_id_lookups(self):
    """Build the id tensors and length tensors for words, chars and labels.

    Sets `word_lengths`, `sequence_lengths`, `word_ids`, `char_ids` and
    `labels` on the instance. Ids beyond the computed lengths are zeroed.
    """
    # Maps '' to 0 and every other string to 1 (the default value), so
    # summing the lookups along an axis counts the non-empty entries.
    nonempty_table = lookup.index_table_from_tensor(
        mapping=tf.constant(['']), default_value=1)

    # Characters: split every word into single characters and restore the
    # sentence layout with an extra trailing character axis.
    sentences_shape = tf.shape(self.padded_sentences, out_type=tf.int64)
    known_char_sentences = remove_unknown_chars(self.padded_sentences,
                                                self.char_table)
    flat_words = tf.reshape(known_char_sentences, [-1])
    sparse_chars = tf.string_split(flat_words, delimiter="")
    dense_chars = tf.sparse_tensor_to_dense(sparse_chars, default_value='')
    max_word_len = tf.gather_nd(sparse_chars.dense_shape, tf.constant([1]))
    chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)
    chars = tf.reshape(dense_chars, chars_shape)
    self.word_lengths = tf.reduce_sum(nonempty_table.lookup(chars), 2)

    # Words: lowercase, and replace purely numeric words with NUM.
    sanitised_sentences = tf.regex_replace(lowercase(self.padded_sentences),
                                           '^[0-9]+$', NUM)
    self.sequence_lengths = tf.reduce_sum(
        nonempty_table.lookup(sanitised_sentences), 1)

    raw_word_ids = self.word_table.lookup(sanitised_sentences)
    raw_char_ids = self.char_table.lookup(chars)
    word_mask = tf.sequence_mask(self.sequence_lengths)
    char_mask = tf.sequence_mask(self.word_lengths)
    self.word_ids = tf.where(word_mask, raw_word_ids,
                             tf.zeros_like(raw_word_ids))
    self.char_ids = tf.where(char_mask, raw_char_ids,
                             tf.zeros_like(raw_char_ids))

    # Labels: same masking, based on the number of non-empty label codes.
    label_lengths = tf.reduce_sum(nonempty_table.lookup(self.label_codes), 1)
    labels_mask = tf.sequence_mask(label_lengths)
    raw_labels = self.label_table.lookup(self.label_codes)
    self.labels = tf.where(labels_mask, raw_labels,
                           tf.zeros_like(raw_labels))
def module_fn_with_preprocessing():
    """Spec function for a full-text embedding module with preprocessing.

    Registers a "default" signature mapping a batch of sentences to one
    combined embedding per sentence.
    """
    # Full-text input.
    sentences = tf.placeholder(shape=[None], dtype=tf.string, name="sentences")

    # Remove punctuation with a regular expression.
    without_punct = tf.regex_replace(input=sentences,
                                     pattern=r"\pP",
                                     rewrite="")
    # Split on spaces, yielding a sparse tensor of tokens.
    tokens = tf.string_split(without_punct, " ")

    # Word-embedding variable.
    embeddings_var = tf.get_variable(
        initializer=tf.zeros([vocab_size + num_oov_buckets, embeddings_dim]),
        name='embedding',
        dtype=tf.float32)

    # Vocabulary table mapping each token to its id.
    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file, num_oov_buckets=num_oov_buckets)

    # Sparse tensor of token ids with the same layout as the tokens.
    token_ids = tf.SparseTensor(indices=tokens.indices,
                                values=vocab_table.lookup(tokens.values),
                                dense_shape=tokens.dense_shape)

    # Fill rows left empty (sentences with no tokens) with the id of "".
    empty_token_id = vocab_table.lookup(tf.constant(""))
    token_ids, _ = tf.sparse_fill_empty_rows(token_ids, empty_token_id)
    # NOTE(review): the original carries a disabled
    # `tf.sparse_reset_shape(sparse_ids)` call at this point -- confirm
    # whether the all-sentences-empty case needs it.

    # Combine the embeddings of each row with the "sqrtn" combiner.
    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=token_ids,
        sp_weights=None,
        combiner="sqrtn")

    # The "default" signature is used unless callers ask for another one.
    # Inputs and outputs are dicts and may hold several tensors.
    hub.add_signature("default",
                      {"sentences": sentences},
                      {"default": combined_embedding})
def vectorize_sentences(sentences):
    """Map each sentence to a dense row of vocabulary ids.

    Args:
        sentences: string tensor of sentences.

    Returns:
        An integer tensor holding one id per word. Rows shorter than the
        longest sentence are padded with the id of PADWORD, and words
        missing from the vocabulary file map to 0.
    """
    # 1. Punctuation becomes whitespace.
    without_punct = tf.regex_replace(sentences, '[[:punct:]]', ' ')

    # 2. Split into words and densify, padding short rows with PADWORD.
    sparse_words = tf.string_split(without_punct)
    words = tf.sparse_tensor_to_dense(sparse_words, default_value=PADWORD)

    # 3. Look every word up in the comma-delimited vocabulary file
    #    (column 0 is the word, column 1 its id).
    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=VOCAB_FILE_PATH,
        num_oov_buckets=0,
        vocab_size=None,
        default_value=0,  # for words not in vocabulary (OOV)
        key_column_index=0,
        value_column_index=1,
        delimiter=',')
    return vocab_table.lookup(words)
def module_fn_with_preprocessing():
    """Spec function for a full-text embedding module with preprocessing.

    Registers a "default" signature mapping a batch of sentences to one
    combined embedding per sentence.
    """
    sentences = tf.placeholder(shape=[None], dtype=tf.string, name="sentences")

    # Minimal text preprocessing: strip punctuation, then split on spaces.
    without_punct = tf.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    tokens = tf.string_split(without_punct, " ")

    # Sentences that are empty before or after normalization produce empty
    # rows. An embedding is still wanted for every row, so empty rows are
    # filled with the empty string.
    tokens, _ = tf.sparse_fill_empty_rows(tokens, "")
    # If every sentence was empty the SparseTensor had shape [?, 0]. After
    # filling the empty rows the shape must be reset so it becomes [?, 1].
    tokens = tf.sparse_reset_shape(tokens)

    embeddings_var = tf.get_variable(
        initializer=tf.zeros([vocab_size + num_oov_buckets, embeddings_dim]),
        name=EMBEDDINGS_VAR_NAME,
        dtype=tf.float32)
    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=num_oov_buckets,
    )

    token_ids = tf.SparseTensor(
        indices=tokens.indices,
        values=vocab_table.lookup(tokens.values),
        dense_shape=tokens.dense_shape)
    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=token_ids,
        sp_weights=None,
        combiner="sqrtn")

    hub.add_signature("default",
                      {"sentences": sentences},
                      {"default": combined_embedding})
def get_doc_fields(hparams):
    """
    Create one string placeholder and one processed tensor per document field.

    A document field is every entry of hparams.feature_names starting with 'doc_'. When
    hparams.regex_replace_pattern is set, each match is rewritten to " \\1 " (the first capture group
    surrounded by spaces) before the text is handed to data_fn.process_text.

    :param hparams: hparams
    :return: (doc_fields, doc_text_placeholders), two lists aligned by document field
    """
    doc_text_placeholders = []
    doc_fields = []
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)

    for ftr_name in hparams.feature_names:
        if not ftr_name.startswith('doc_'):
            continue

        # each document field is a placeholder (a string vector)
        placeholder = tf.placeholder(shape=[None], dtype=tf.string,
                                     name=ftr_name + "_placeholder")
        doc_text_placeholders.append(placeholder)

        field = placeholder
        # add whitespace on both sides of punctuations if regex pattern is not None
        if hparams.regex_replace_pattern is not None:
            field = tf.regex_replace(input=field,
                                     pattern=hparams.regex_replace_pattern,
                                     rewrite=" \\1 ")

        cnn_window = max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0
        field = data_fn.process_text(field,
                                     tf_vocab_table,
                                     hparams.CLS,
                                     hparams.SEP,
                                     hparams.PAD,
                                     hparams.max_len,
                                     hparams.min_len,
                                     cnn_filter_window_size=cnn_window)
        # Add a leading axis of size one in front of the processed field.
        doc_fields.append(tf.expand_dims(field, axis=0))

    return doc_fields, doc_text_placeholders
def ComputeWer(hyps, refs):
    """Computes word errors in hypotheses relative to reference transcripts.

    Args:
        hyps: Hypotheses, represented as string tensors of shape [N].
        refs: References, represented as string tensors of shape [N].

    Returns:
        An int64 tensor, word_errs, of size [N, 2] where word_errs[i, 0]
        corresponds to the number of word errors in hyps[i] relative to
        refs[i]; word_errs[i, 1] corresponds to the number of words in
        refs[i].
    """

    def _NormalizeWhitespace(s):
        # Trim both ends and collapse every whitespace run to one space.
        stripped = tf.strings.strip(s)
        return tf.regex_replace(stripped, r'\s+', ' ')

    hyps = _NormalizeWhitespace(hyps)
    refs = _NormalizeWhitespace(refs)

    hyps = py_utils.HasRank(hyps, 1)
    refs = py_utils.HasRank(refs, 1)
    hyps = py_utils.HasShape(hyps, tf.shape(refs))

    # Word-level edit distance, not normalized by the reference length.
    distance = tf.edit_distance(tf.string_split(hyps),
                                tf.string_split(refs),
                                normalize=False)
    word_errors = tf.to_int64(distance)

    # After normalization words are separated by exactly one space, so the
    # word count is the number of spaces plus one...
    only_spaces = tf.regex_replace(refs, '[^ ]', '')
    ref_words = tf.to_int64(tf.strings.length(only_spaces) + 1)
    # ...except for an empty reference, which has zero words.
    is_empty_ref = tf.equal(refs, '')
    ref_words = tf.where(is_empty_ref,
                         tf.zeros_like(ref_words, tf.int64),
                         ref_words)

    columns = [tf.expand_dims(word_errors, -1), tf.expand_dims(ref_words, -1)]
    return tf.concat(columns, axis=1)
def get_query(hparams):
    """
    Helper function to get query and query_placeholder

    :param hparams: hparams
    :return: query and query_placeholder; query is passed through unchanged when
        create_placeholder_for_ftrs returns None for it
    """
    # query text feature
    # With add_first_dim_for_query_placeholder the placeholder has shape [None] so that the query can
    # be fed like a document feature in model serving; otherwise it is a scalar string.
    with_first_dim = hparams.add_first_dim_for_query_placeholder
    placeholder_shape = [None] if with_first_dim else []
    query_placeholder, query = create_placeholder_for_ftrs(
        "query_placeholder", placeholder_shape, tf.string, 'query',
        hparams.feature_names)

    if query is None:
        return query, query_placeholder

    if with_first_dim:
        # remove added dimension
        query = tf.squeeze(query, [0])

    # tokenize query
    if hparams.regex_replace_pattern is not None:
        query = tf.regex_replace(input=query,
                                 pattern=hparams.regex_replace_pattern,
                                 rewrite=" \\1 ")

    vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    cnn_window = max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0
    query = data_fn.process_text(query,
                                 vocab_table,
                                 hparams.CLS,
                                 hparams.SEP,
                                 hparams.PAD,
                                 hparams.max_len,
                                 hparams.min_len,
                                 cnn_filter_window_size=cnn_window)
    return query, query_placeholder
def get_query(hparams, regex_replace_pattern, add_dimension=False):
    """
    Helper function to get query and query_placeholder

    :param hparams: hparams
    :param regex_replace_pattern: The regex pattern to add a white space before and after
    :param add_dimension: whether to add a dimension then remove to query (this is to support online
        model for QAP as quasar model serving requires at least one dimension)
    :return: query and query_placeholder; query is passed through unchanged when
        create_placeholder_for_ftrs returns None for it
    """
    # query text feature
    placeholder_shape = [None] if add_dimension else []
    query_placeholder, query = create_placeholder_for_ftrs(
        "query_placeholder", placeholder_shape, tf.string, 'query',
        hparams.feature_names)

    if query is None:
        return query, query_placeholder

    if add_dimension:
        # remove added dimension
        query = tf.squeeze(query, [0])

    # tokenize query
    if regex_replace_pattern is not None:
        query = tf.regex_replace(input=query,
                                 pattern=regex_replace_pattern,
                                 rewrite=" \\1 ")

    vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    query = data_fn.process_text(
        query,
        vocab_table,
        hparams.CLS,
        hparams.SEP,
        hparams.PAD,
        hparams.max_len,
        hparams.min_len,
        cnn_filter_window_size=max(hparams.filter_window_sizes))
    return query, query_placeholder
def parse_raw_text(sentence):
    """Splits text tensor by word to sparse sequence of tokens.

    Every match of `_CHAR_TO_FILTER_OUT` is replaced by a space before the
    text is split on whitespace.

    Args:
        sentence: `tf.string`, with text record to split.

    Returns:
        Dictionary mapping feature name to tensors with the following
        entries `constants.TOKENS` mapping to a `SparseTensor` and
        `constants.SEQUENCE_LENGTH` mapping to a one-dimensional integer
        `Tensor`.
    """
    filtered = tf.regex_replace(sentence,
                                _CHAR_TO_FILTER_OUT,
                                ' ',
                                replace_global=True)
    sparse_tokens = tf.string_split(filtered)
    sequence_length = get_sparse_tensor_size(sparse_tokens)
    return {
        constants.TOKENS: sparse_tokens,
        constants.SEQUENCE_LENGTH: sequence_length,
    }
def encode_features(strings_tensor, table, n_vocab, max_len):
    """Given a string tensor, generate a one hot representation for the model.

    The character splitting hack is due to this open tensorflow bug:
    https://github.com/tensorflow/tensorflow/pull/12971. As a workaround,
    every character is followed by SPLIT_CHAR, a non printable character
    (BEEP), and the string is then split on it. SPLIT_CHAR must therefore
    never occur in the source material; it was chosen because text is highly
    unlikely to contain BEEP characters and because it is < 128, which the
    hack requires.

    Args:
        strings_tensor: string tensor to encode.
        table: lookup table mapping single characters to ids.
        n_vocab: depth of the one-hot encoding.
        max_len: maximum number of characters kept per string.

    Returns:
        The one-hot encoding of the first `max_len` character ids of every
        string; positions past the end of a string use id 0.
    """
    # '\\0' in the rewrite stands for the whole match, so each character is
    # kept and followed by the split marker.
    interleaved = tf.regex_replace(strings_tensor, '.', '\\0%s' % SPLIT_CHAR)
    sparse_chars = tf.string_split(interleaved, delimiter=SPLIT_CHAR)
    char_ids = tf.sparse_tensor_to_dense(table.lookup(sparse_chars),
                                         default_value=0)
    truncated_ids = char_ids[:, 0:max_len]
    return tf.one_hot(truncated_ids, n_vocab)