Beispiel #1
0
 def from_tokens(raw, lookup_):
     """Turn rows of token ids into space-separated string tokens.

     The ids in `raw` are cast to int32 and used to index `lookup_`. The
     looked-up pieces of each row are joined along axis 1, every match of
     `<EOS>.*` is removed, underscores are turned into spaces, and the
     cleaned strings are split on " ".

     Returns:
       The result of `tf.string_split` on the cleaned strings.
     """
     indices = tf.cast(raw, tf.int32)
     pieces = tf.gather(lookup_, indices)
     sentences = tf.reduce_join(pieces, axis=1)
     # Drop the end-of-sequence marker together with whatever follows it.
     truncated = tf.regex_replace(sentences, b"<EOS>.*", b"")
     spaced = tf.regex_replace(truncated, b"_", b" ")
     return tf.string_split(spaced, " ")
Beispiel #2
0
def _bpe2word_with_pad(tensor):
    """Merge BPE pieces into words and pad the word list with "⍷".

    Each "<pad>" piece is rewritten to "▁⍷" before all pieces are joined
    into one string; every "▁" then becomes a space, so padding pieces turn
    into stand-alone "⍷" words. After splitting, "⍷" entries are appended
    until the result has MAX_WORD_LEN entries.

    NOTE(review): the tile multiple is `MAX_WORD_LEN - word_count`, so this
    presumably expects at most MAX_WORD_LEN words -- confirm with callers.
    """
    marked = tf.regex_replace(tensor, "<pad>", "▁⍷")
    sentence = tf.regex_replace(tf.reduce_join(marked), "▁", " ")
    words = tf.string_split([sentence]).values
    filler = tf.tile(tf.constant(["⍷"]),
                     [MAX_WORD_LEN - tf.shape(words)[0]])
    return tf.concat([words, filler], axis=-1)
Beispiel #3
0
    def _ComputeDecoderMetrics(self, decoder_outs, input_batch):
        """Builds the tensors used to score decoder hypotheses.

        Args:
          decoder_outs: Decoder output; `self._GetTopK` extracts the top-k
            hypotheses from it.
          input_batch: Input batch providing `sample_ids`, `tgt` and, when
            `params.target_key` is set, `additional_tgts`.

        Returns:
          A dict of tensors with the target fields, utterance ids, reference
          transcripts, top-k hypotheses and normalized WER counts, extended
          with the entries returned by `AddAdditionalDecoderMetricsToGraph`.
        """
        p = self.params
        topk = self._GetTopK(decoder_outs)
        utt_ids = input_batch.sample_ids

        # Use the additional target stream when one is configured.
        tgt = input_batch.tgt
        if p.target_key:
            tgt = input_batch.additional_tgts[p.target_key]

        # Reference length: number of unpadded positions minus one.
        ref_lens = tf.cast(
            tf.reduce_sum(1.0 - tgt.paddings, 1) - 1.0, tf.int32)
        transcripts = self.input_generator.IdsToStrings(tgt.labels, ref_lens)

        # Filter out all isolated '<noise>' tokens.
        noise_pattern = ' <noise> |^<noise> | <noise>$|^<noise>$'
        refs = tf.regex_replace(transcripts, noise_pattern, ' ')
        hyps = tf.regex_replace(topk.decoded, noise_pattern, ' ')

        # Repeat every reference once per hypothesis of its beam, then
        # flatten both sides to rank 1 before computing WER.
        hyps_per_beam = p.decoder.beam_search.num_hyps_per_beam
        refs = tf.tile(tf.reshape(refs, [-1, 1]), [1, hyps_per_beam])
        hyps = tf.reshape(hyps, [-1])
        refs = tf.reshape(refs, [-1])
        norm_wer_errors, norm_wer_words = self._ComputeNormalizedWER(
            hyps, refs)

        metrics = dict(
            target_ids=tgt.ids,
            target_labels=tgt.labels,
            target_weights=tgt.weights,
            target_paddings=tgt.paddings,
            utt_id=utt_ids,
            transcripts=transcripts,
            topk_decoded=topk.decoded,
            topk_ids=topk.ids,
            topk_lens=topk.lens,
            topk_scores=topk.scores,
            norm_wer_errors=norm_wer_errors,
            norm_wer_words=norm_wer_words,
        )
        extra_metrics = self.AddAdditionalDecoderMetricsToGraph(
            topk, hyps, refs, input_batch, decoder_outs)
        metrics.update(extra_metrics)
        return metrics
Beispiel #4
0
    def Decode(self):
        """Constructs the inference graph.

        Encodes the preprocessed input batch, runs beam search on the
        encoder output and scores the top-k hypotheses against the reference
        transcripts.

        Returns:
          A dict of tensors with the target fields, utterance ids, reference
          transcripts, top-k hypotheses and normalized WER counts, extended
          with the entries returned by `AddAdditionalDecoderMetricsToGraph`.
        """
        p = self.params
        with tf.name_scope('fprop'), tf.name_scope(p.name):
            batch = self.input_generator.GetPreprocessedInputBatch()
            src_enc, src_enc_padding, _ = self.encoder.FPropDefaultTheta(
                batch.src)

            # Decoders with a contextualizer get the targets as context map.
            if hasattr(self.decoder, 'contextualizer'):
                self.decoder.contextualizer.SetContextMap(batch.tgt)
            decoder_outs = self.decoder.BeamSearchDecode(
                src_enc, src_enc_padding)
            topk = self._GetTopK(decoder_outs)
            utt_ids = batch.sample_ids

            # Use the additional target stream when one is configured.
            tgt = batch.tgt
            if p.target_key:
                tgt = batch.additional_tgts[p.target_key]

            # Reference length: number of unpadded positions minus one.
            ref_lens = tf.cast(
                tf.reduce_sum(1.0 - tgt.paddings, 1) - 1.0, tf.int32)
            transcripts = self.input_generator.IdsToStrings(
                tgt.labels, ref_lens)

            # Filter out all isolated '<noise>' tokens.
            noise_pattern = ' <noise> |^<noise> | <noise>$|^<noise>$'
            refs = tf.regex_replace(transcripts, noise_pattern, ' ')
            hyps = tf.regex_replace(topk.decoded, noise_pattern, ' ')

            # Repeat every reference once per hypothesis of its beam, then
            # flatten both sides to rank 1 before computing WER.
            hyps_per_beam = p.decoder.beam_search.num_hyps_per_beam
            refs = tf.tile(tf.reshape(refs, [-1, 1]), [1, hyps_per_beam])
            hyps = tf.reshape(hyps, [-1])
            refs = tf.reshape(refs, [-1])
            norm_wer_errors, norm_wer_words = self._ComputeNormalizedWER(
                hyps, refs)

            metrics = dict(
                target_ids=tgt.ids,
                target_labels=tgt.labels,
                target_weights=tgt.weights,
                target_paddings=tgt.paddings,
                utt_id=utt_ids,
                transcripts=transcripts,
                topk_decoded=topk.decoded,
                topk_ids=topk.ids,
                topk_lens=topk.lens,
                topk_scores=topk.scores,
                norm_wer_errors=norm_wer_errors,
                norm_wer_words=norm_wer_words,
            )
            extra_metrics = self.AddAdditionalDecoderMetricsToGraph(
                topk, hyps, refs)
            metrics.update(extra_metrics)
            return metrics
def load_img_cuhk(image_file, img_width, img_height, is_train):
    """Load an input image and the photo whose path is derived from it.

    The target path is obtained from `image_file` by applying, in order, the
    substitutions "sketches" -> "photos", "-sz1" -> "", "F2-" -> "f-" and
    "M2-" -> "m-". The input is decoded as a 3-channel JPEG; the target is
    decoded with the decoder's default channel setting. Both are handed to
    `_load_image` together with the size and training arguments.
    """
    sketch = tf.image.decode_jpeg(tf.read_file(image_file), channels=3)

    # (pattern, rewrite) pairs mapping the input path onto the target path.
    path_rewrites = (
        ("sketches", "photos"),
        ("-sz1", ""),
        ("F2-", "f-"),
        ("M2-", "m-"),
    )
    photo_path = image_file
    for pattern, rewrite in path_rewrites:
        photo_path = tf.regex_replace(photo_path, pattern, rewrite)
    photo = tf.image.decode_jpeg(tf.read_file(photo_path))

    return _load_image(sketch, photo, img_width, img_height, is_train)
Beispiel #6
0
def parse_example(parsed_features):
    """Unpack a parsed example into the input, label and description strings.

    Returns:
      A 7-tuple `(input, label, types, insts, genre, id, files)` where
      `types`, `insts` and `files` are the two corresponding 'comb/...'
      fields joined as "<first> x <second>", and spaces in the two inst
      fields have been replaced by underscores.
    """

    def _pair(first, second):
        # Join two string tensors as "<first> x <second>".
        return tf.string_join([first, ' x ', second])

    label = parsed_features['comb/label']
    example_input = parsed_features['example/input']
    inst_a = tf.regex_replace(parsed_features['comb/inst1'], ' ', '_')
    inst_b = tf.regex_replace(parsed_features['comb/inst2'], ' ', '_')

    types = _pair(parsed_features['comb/type1'], parsed_features['comb/type2'])
    insts = _pair(inst_a, inst_b)
    genre = parsed_features['comb/genre']
    example_id = parsed_features['comb/id']
    files = _pair(parsed_features['comb/file1'], parsed_features['comb/file2'])

    return example_input, label, types, insts, genre, example_id, files
Beispiel #7
0
 def decode(line):
     """Parse one delimited text line into a float32 vector.

     The line is split on `self.field_delim`; when `self.index` is truthy
     the first field is dropped. Fields equal to one of `self.na_values`
     (ignoring surrounding whitespace) are replaced by 'nan' before all
     fields are converted with `tf.string_to_number`.

     The NA markers are treated as literal, whole-field values: they are
     regex-escaped and anchored, so a marker such as "-", "." or "?" can no
     longer rewrite part of an ordinary number, and an empty `na_values`
     no longer produces an empty pattern that matches at every position.
     """
     import re  # Local import: only needed to escape the NA markers.

     fields = tf.string_split([line], self.field_delim).values
     if self.index:  # Skip index
         fields = fields[1:]

     escaped_markers = [re.escape(value) for value in self.na_values]
     if escaped_markers:
         na_pattern = r'^\s*(?:' + '|'.join(escaped_markers) + r')\s*$'
         fields = tf.regex_replace(fields, na_pattern, 'nan')
     return tf.string_to_number(fields, tf.float32)
def load_img_celeba(image_file, img_width, img_height, is_train):
    """Load an input image and the photo whose path is derived from it.

    The target path is `image_file` with "landmarks" replaced by "photos".
    The input is decoded as a 3-channel JPEG; the target is decoded with the
    decoder's default channel setting. Both are handed to `_load_image`
    together with the size and training arguments.
    """
    landmarks = tf.image.decode_jpeg(tf.read_file(image_file), channels=3)

    photo_path = tf.regex_replace(image_file, "landmarks", "photos")
    photo = tf.image.decode_jpeg(tf.read_file(photo_path))

    return _load_image(landmarks, photo, img_width, img_height, is_train)
Beispiel #9
0
    def serving_input_fn():
        """Build the serving input receiver fed by a filename placeholder.

        Without `channel_ids` the file is decoded as one 3-channel image.
        With `channel_ids`, one file per id is read -- the trailing ".png"
        of the filename is replaced by `separator + id + ".png"` -- and the
        decoded channels are concatenated along axis 2. A single id is
        decoded with 3 channels, several ids with 1 channel each.

        The image is resized with `resize_image` when `resized_size` is a
        positive value.

        Returns:
          A `tf.estimator.export.ServingInputReceiver` whose default
          receiver is the filename, with the alternatives 'from_image' and
          'from_resized_images'.
        """
        print('new serving_input_fn')
        # define placeholder for filename
        filename = tf.placeholder(dtype=tf.string)

        # TODO : make it batch-compatible (with Dataset or string input producer)

        def _read_image(path, num_channels):
            # Decode one file into a float tensor.
            return tf.to_float(
                tf.image.decode_jpeg(tf.read_file(path),
                                     channels=num_channels,
                                     try_recover_truncated=True))

        if not channel_ids:
            decoded_image = _read_image(filename, 3)
        else:
            channel_num = 3 if len(channel_ids) == 1 else 1
            decoded_image = None
            for channel_id in channel_ids:
                # Escape the dot and anchor at the end so only the file
                # extension is rewritten; a bare '.png' pattern would also
                # match e.g. "xpng" anywhere in the path.
                channel_name = tf.regex_replace(
                    filename, r'\.png$', separator + channel_id + '.png')
                decoded_channel = _read_image(channel_name, channel_num)
                if decoded_image is None:
                    decoded_image = decoded_channel
                else:
                    decoded_image = tf.concat([decoded_image, decoded_channel],
                                              2)

        original_shape = tf.shape(decoded_image)[:2]

        if resized_size is not None and resized_size > 0:
            image = resize_image(decoded_image, resized_size)
        else:
            image = decoded_image

        # Add the leading batch dimension expected by the model.
        image_batch = image[None]
        features = {'images': image_batch, 'original_shape': original_shape}

        receiver_inputs = {'filename': filename}

        input_from_resized_images = {'resized_images': image_batch}
        input_from_original_image = {'image': decoded_image}

        return tf.estimator.export.ServingInputReceiver(
            features,
            receiver_inputs,
            receiver_tensors_alternatives={
                'from_image': input_from_original_image,
                'from_resized_images': input_from_resized_images
            })
Beispiel #10
0
  def from_characters(raw, lookup_):
    """Decode rows of (character code + 2) values into string tokens.

    Each code has 2 subtracted and is clipped to [0, 255], then bitcast to
    uint8 and used (as int32) to index `lookup_`. Element 0 of the last axis
    of the gathered result is kept, the characters of every row are joined,
    NUL bytes are removed and the strings are split on " ".

    Returns:
      The result of `tf.string_split` on the cleaned strings.
    """
    shifted = tf.clip_by_value(tf.subtract(raw, 2), 0, 255)
    codes = tf.bitcast(shifted, tf.uint8)

    chars = tf.gather(lookup_, tf.cast(codes, tf.int32))[:, :, 0]
    sentences = tf.reduce_join(chars, axis=1)
    without_nul = tf.regex_replace(sentences, b"\0", b"")
    return tf.string_split(without_nul, " ")
Beispiel #11
0
def load_img_to_tensor(dict_type_to_imagepath):
    """Load the PNG behind every path of a {type: path} dict.

    In each path the literal text "$KITTIPATH" is replaced by a hard-coded
    dataset root. Entries of type 'labelM' are decoded as uint8 and cast to
    float32; all other entries are decoded as uint16 and cast to int32.
    Every image is cropped or padded to 352 x 1216.

    Args:
      dict_type_to_imagepath: Mapping from an image type name to its path.

    Returns:
      A dict with the same keys mapping to the loaded image tensors.
    """
    # The regex must be a raw string: '\$' is not a valid Python escape.
    root_pattern = r'\$KITTIPATH'

    dict_res = {}
    for str_type, str_filepath in dict_type_to_imagepath.items():
        # The dataset root is hard-coded here (no environment lookup), so
        # there is no "KITTIPATH not defined" case left to guard against.
        if str_type == 'labelM':
            kittipath = '/notebooks/dataset'
            png_dtype, out_dtype = tf.uint8, tf.float32
        else:
            kittipath = '/notebooks/dataset/'
            png_dtype, out_dtype = tf.uint16, tf.int32

        str_filepath = tf.regex_replace(str_filepath,
                                        tf.constant(root_pattern),
                                        tf.constant(kittipath))
        tf_filepath = tf.read_file(str_filepath)
        tf_tensor = tf.image.decode_png(tf_filepath, dtype=png_dtype)
        tf_tensor = tf.cast(tf_tensor, dtype=out_dtype)
        tf_tensor = tf.image.resize_image_with_crop_or_pad(
            tf_tensor, 352, 1216)

        dict_res[str_type] = tf_tensor
    return dict_res
Beispiel #12
0
def parse_text_line(line, path):
    """Turn one "<image file> <label file>" line into a training example.

    The line is split with `tf.string_split`; the first two fields are the
    image and label filenames. In both, "/SegNet" is replaced by `path`.
    The image is loaded with 3 channels, cast to float32 and divided by 255;
    the label is loaded with 1 channel and cast to int32. Both go through
    `randomly_scale` with the target size from FLAGS, and the label
    additionally through `replace_ignore_label`.

    Returns:
      A `({'image': image}, label)` tuple.
    """
    filenames = tf.string_split([line]).values
    # Raw string: "\/" is not a valid Python escape sequence. The regex
    # itself is unchanged (an escaped slash followed by "SegNet").
    pattern = r"\/SegNet"
    image_filename = tf.regex_replace(filenames[0], pattern, path)
    label_filename = tf.regex_replace(filenames[1], pattern, path)

    image = tf.cast(get_image_tensor(image_filename, channels=3), tf.float32)
    label = tf.cast(get_image_tensor(label_filename, channels=1), tf.int32)
    image = image / 255.

    target_size = [FLAGS.height, FLAGS.width]
    resized_image, resized_label = randomly_scale(image, label, target_size)
    resized_label = replace_ignore_label(resized_label)
    return {'image': resized_image}, resized_label
Beispiel #13
0
    def _ComputeNormalizedWER(self, hyps, refs):
        """Computes per-hypothesis WER error and word counts.

        Runs of '<epsilon>' in `hyps` are replaced by a single space before
        `decoder_utils.ComputeWer` is applied. Its result is split along
        axis 1 into two width-1 columns, and each column is reshaped to
        `[-1, num_hyps_per_beam]`.

        Returns:
          A `(norm_wer_errors, norm_wer_words)` tuple.
        """
        per_beam_shape = [-1, self.params.decoder.beam_search.num_hyps_per_beam]

        # Filter out all '<epsilon>' tokens for norm_wer computation.
        cleaned_hyps = tf.regex_replace(hyps, '(<epsilon>)+', ' ')
        wer = decoder_utils.ComputeWer(cleaned_hyps, refs)
        error_col, word_col = tf.split(wer, [1, 1], 1)

        return (tf.reshape(error_col, per_beam_shape),
                tf.reshape(word_col, per_beam_shape))
Beispiel #14
0
    def load_images(self, filename):
        """Load an image plus the mask and depth images derived from its path.

        The mask path is `filename` with "rgb" replaced by "mask". The depth
        path is `filename` with "rgb" replaced by "depth" and then "shapenet"
        replaced by "shapenet_depth". All three files are decoded as PNG and
        resized to `self.img_size` x `self.img_size`; mask and depth are
        additionally converted to grayscale.

        Returns:
          A `(image, mask, depth, filename)` tuple.
        """

        def _load_resized(path):
            # Read a PNG and resize it to the configured square size.
            decoded = tf.image.decode_png(tf.read_file(path))
            return tf.image.resize_images(decoded,
                                          [self.img_size, self.img_size])

        image_resized = _load_resized(filename)

        mask_path = tf.regex_replace(filename, "rgb", "mask")
        mask_gray = tf.image.rgb_to_grayscale(_load_resized(mask_path))

        depth_path = tf.regex_replace(filename, "rgb", "depth")
        depth_path = tf.regex_replace(depth_path, "shapenet",
                                      "shapenet_depth")
        depth_gray = tf.image.rgb_to_grayscale(_load_resized(depth_path))

        return image_resized, mask_gray, depth_gray, filename
Beispiel #15
0
def load_and_resize_image(filename: str,
                          channels: int,
                          size: int = None,
                          interpolation: str = 'BILINEAR',
                          channel_ids: list = (),
                          separator: str = '') -> tf.Tensor:
    """Loads an image from its filename and resizes it to the desired output size.

    :param filename: string tensor
    :param channels: number of channels for the decoded image (only used when `channel_ids` is empty)
    :param size: number of desired pixels in the resized image, tf.Tensor or int (None for no resizing)
    :param interpolation: interpolation method forwarded to `resize_image`
    :param channel_ids: optional channel suffixes; for every id the file obtained by replacing the
        trailing ".png" of `filename` with `separator + id + ".png"` is read and the results are
        concatenated along axis 2 (a single id is decoded with 3 channels, several ids with 1 each)
    :param separator: string inserted between the base name and each channel id
    :return: decoded and resized float32 tensor [h, w, channels],
    """
    with tf.name_scope('load_img'):

        def _read_image(path, num_channels):
            # Decode one file into a float tensor.
            return tf.to_float(
                tf.image.decode_jpeg(tf.read_file(path),
                                     channels=num_channels,
                                     try_recover_truncated=True))

        if not channel_ids:
            decoded_image = _read_image(filename, channels)
        else:
            channel_num = 3 if len(channel_ids) == 1 else 1
            decoded_image = None
            for channel_id in channel_ids:
                # Escape the dot and anchor at the end so only the file
                # extension is rewritten; a bare '.png' pattern would also
                # match e.g. "xpng" anywhere in the path.
                channel_name = tf.regex_replace(
                    filename, r'\.png$', separator + channel_id + '.png')
                decoded_channel = _read_image(channel_name, channel_num)
                if decoded_image is None:
                    decoded_image = decoded_channel
                else:
                    decoded_image = tf.concat([decoded_image, decoded_channel],
                                              2)

        # TODO : if one side is smaller than size of patches (and make patches == true),
        # TODO : force the image to have at least patch size
        if size is not None and not (isinstance(size, int) and size <= 0):
            result_image = resize_image(decoded_image, size, interpolation)
        else:
            result_image = decoded_image

        return result_image
Beispiel #16
0
def get_usr_fields(hparams):
    """Create a string placeholder and processed tensor per user field.

    A user field is every entry of `hparams.feature_names` that starts with
    'usr_'. Its placeholder has shape [None] when
    `hparams.add_first_dim_for_usr_placeholder` is set and is a scalar
    otherwise. When `hparams.regex_replace_pattern` is given, matches are
    rewritten to " \\1 " (whitespace around the captured group). The text is
    then brought to shape [1] and passed to `data_fn.process_text`.

    :param hparams: hparams
    :return: (processed user fields, user text placeholders), in the order
        of `hparams.feature_names`
    """
    tf_vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    usr_fields, usr_text_placeholders = [], []

    for ftr_name in hparams.feature_names:
        if not ftr_name.startswith('usr_'):
            continue

        # With the extra leading dimension the user fields can be fed like
        # document features in model serving.
        placeholder_shape = (
            [None] if hparams.add_first_dim_for_usr_placeholder else [])
        placeholder = tf.placeholder(shape=placeholder_shape,
                                     dtype=tf.string,
                                     name=ftr_name + "_placeholder")
        usr_text_placeholders.append(placeholder)

        text = placeholder
        if hparams.regex_replace_pattern is not None:
            text = tf.regex_replace(input=text,
                                    pattern=hparams.regex_replace_pattern,
                                    rewrite=" \\1 ")

        # Drop the added dimension again, then add the axis process_text
        # is called with.
        if hparams.add_first_dim_for_usr_placeholder:
            text = tf.squeeze(text, [0])
        text = tf.expand_dims(text, axis=0)

        if hparams.ftr_ext == 'cnn':
            window_size = max(hparams.filter_window_sizes)
        else:
            window_size = 0
        usr_fields.append(
            data_fn.process_text(text,
                                 tf_vocab_table,
                                 hparams.CLS,
                                 hparams.SEP,
                                 hparams.PAD,
                                 hparams.max_len,
                                 hparams.min_len,
                                 cnn_filter_window_size=window_size))
    return usr_fields, usr_text_placeholders
Beispiel #17
0
    def module_fn_with_preprocessing():
        """Define a text-embedding module that takes raw sentences.

        Punctuation (`\\pP`) is stripped from the input sentences, which are
        then split on spaces. Tokens are mapped to ids through a vocabulary
        table built from `vocabulary_file`, and every sentence is embedded by
        a "sqrtn"-combined sparse lookup into the embeddings variable. The
        result is registered as the "default" hub signature.
        """
        sentences = tf.placeholder(shape=[None],
                                   dtype=tf.string,
                                   name="sentences")

        # Minimal preprocessing: drop punctuation, then tokenize on spaces.
        without_punct = tf.regex_replace(input=sentences,
                                         pattern=r"\pP",
                                         rewrite="")
        tokens = tf.string_split(without_punct, " ")

        embedding_shape = [vocab_size + num_oov_buckets, embeddings_dim]
        embeddings_var = tf.get_variable(
            initializer=tf.zeros(embedding_shape),
            name=EMBEDDINGS_VAR_NAME,
            dtype=tf.float32)

        # Vocabulary table: key is the whole line, value is the line number.
        lookup_table = tf.lookup.StaticVocabularyTable(
            tf.lookup.TextFileInitializer(
                vocabulary_file, tf.string,
                tf.lookup.TextFileIndex.WHOLE_LINE, tf.int64,
                tf.lookup.TextFileIndex.LINE_NUMBER),
            num_oov_buckets=num_oov_buckets)
        token_ids = tf.SparseTensor(
            indices=tokens.indices,
            values=lookup_table.lookup(tokens.values),
            dense_shape=tokens.dense_shape)

        # Sentences that are empty (before or after normalization) leave
        # empty rows; fill them with the id of "" so that every row still
        # gets an embedding.
        token_ids, _ = tf.sparse_fill_empty_rows(
            token_ids, lookup_table.lookup(tf.constant("")))
        # If every sentence was empty the shape is [?, 0]; after filling the
        # rows it has to be reset so that it becomes
        # [dense_shape[0], max(1, dense_shape[1])].
        token_ids = tf.sparse_reset_shape(token_ids)

        sentence_embeddings = tf.nn.embedding_lookup_sparse(
            params=embeddings_var,
            sp_ids=token_ids,
            sp_weights=None,
            combiner="sqrtn")

        hub.add_signature("default", {"sentences": sentences},
                          {"default": sentence_embeddings})
Beispiel #18
0
def load_img_to_tensor(dict_type_to_imagepath):
    """Load the 16-bit PNG behind every path of a {type: path} dict.

    When the KITTIPATH environment variable is set, the literal text
    "$KITTIPATH" in each path is replaced by its value. When it is not set,
    a warning is printed for the entry and the path is used unchanged. Each
    image is decoded as uint16 and cast to int32.

    Args:
      dict_type_to_imagepath: Mapping from an image type name to its path.

    Returns:
      A dict with the same keys mapping to the loaded image tensors.
    """
    # Look the root up once. Only a missing variable is treated as the
    # best-effort case; errors raised while building the replace op are no
    # longer hidden behind the "not defined" warning.
    kittipath = os.environ.get('KITTIPATH')
    # The regex must be a raw string: '\$' is not a valid Python escape.
    root_pattern = r'\$KITTIPATH'

    dict_res = {}
    for str_type, str_filepath in dict_type_to_imagepath.items():
        if kittipath is None:
            print(
                "WARNING: KITTIPATH not defined - this may result in errors!")
        else:
            str_filepath = tf.regex_replace(str_filepath,
                                            tf.constant(root_pattern),
                                            tf.constant(kittipath))
        tf_filepath = tf.read_file(str_filepath)
        tf_tensor = tf.image.decode_png(tf_filepath, dtype=tf.uint16)
        tf_tensor = tf.cast(tf_tensor, dtype=tf.int32)

        dict_res[str_type] = tf_tensor
    return dict_res
Beispiel #19
0
    def load_img_to_tensor(self, dict_type_to_imagepath):
        """Load and scale the 16-bit PNG behind every path of a {type: path} dict.

        When the KITTIPATH environment variable is set, the literal text
        "$KITTIPATH" in each path is replaced by its value. When it is not
        set, a warning is printed for the entry and the path is used
        unchanged. Each image is decoded as uint16, cropped or padded to
        `self.parameters.image_size`, cast to float32 and divided by 256.

        Args:
          dict_type_to_imagepath: Mapping from an image type name to its path.

        Returns:
          A dict with the same keys mapping to the loaded image tensors.
        """
        # Look the root up once. Only a missing variable is treated as the
        # best-effort case; errors raised while building the replace op are
        # no longer hidden behind the "not defined" warning.
        kittipath = os.environ.get('KITTIPATH')
        # The regex must be a raw string: '\$' is not a valid Python escape.
        root_pattern = r'\$KITTIPATH'
        target_height, target_width = (self.parameters.image_size[0],
                                       self.parameters.image_size[1])

        dict_res = {}
        for str_type, str_filepath in dict_type_to_imagepath.items():
            if kittipath is None:
                print("WARNING: KITTIPATH not defined - this may result in errors!")
            else:
                str_filepath = tf.regex_replace(str_filepath,
                                                tf.constant(root_pattern),
                                                tf.constant(kittipath))
            tf_filepath = tf.read_file(str_filepath)
            tf_tensor = tf.image.decode_png(tf_filepath, dtype=tf.uint16)
            tf_tensor = tf.image.resize_image_with_crop_or_pad(
                tf_tensor, target_height, target_width)
            tf_tensor = tf.cast(tf_tensor, dtype=tf.float32)
            tf_tensor = tf.divide(tf_tensor, 256.0)

            dict_res[str_type] = tf_tensor
        return dict_res
Beispiel #20
0
def vectorize_smile(data_dict, vocab, data_hparams):
    """Tokenize one SMILES string and build the sequence inputs and labels.

    The string is read from `data_dict["smile"]`; "@" is removed first when
    `data_hparams.skip_at_symbol` is set. Token ids are produced in a
    `tf.py_func` and truncated to `max_seq_len - 1` so that EOS still fits.

    Returns:
      A dict with
        "smile": the (possibly "@"-stripped) string,
        "decoder_lens": number of tokens plus one,
        "decoder_inputs": one-hot of [GO] + tokens,
        "decoder_labels": tokens + [EOS],
        "encoder_inputs": one-hot of the tokens.
    """
    smile = data_dict["smile"]

    def _tokenize(text):
        return true_smile_tokenizer(
            text, skip_at_symbol=data_hparams.skip_at_symbol)

    if data_hparams.skip_at_symbol:
        smile = tf.regex_replace(smile, "@", "")

    def _encode(smi):
        """Convert a SMILES string into a (truncated) int32 id array."""
        ids = np.array(sentence_to_token_ids(smi,
                                             vocabulary=vocab,
                                             tokenizer=_tokenize),
                       dtype=np.int32)
        # Leave room for the EOS id that is appended to the labels.
        limit = data_hparams.max_seq_len - 1
        if len(ids) > limit:
            ids = ids[:limit]
        return ids

    tokens = tf.py_func(_encode, [smile], tf.int32)
    tokens.set_shape((None, ))
    seq_len = tf.shape(tokens)[0] + 1

    # Labels end with EOS, inputs start with GO (labels shifted by one).
    eos = tf.constant([vocab.EOS_ID], dtype=tokens.dtype)
    seq_labels = tf.concat([tokens, eos], -1)
    go = tf.constant([vocab.GO_ID], dtype=tokens.dtype)
    seq_inputs = tf.concat([go, tokens], -1)

    # One-hot encodings of shape [seq_length, len(vocab)].
    depth = len(vocab)
    seq_inputs = tf.one_hot(seq_inputs, depth, dtype=tf.float32)
    encoder_inputs = tf.one_hot(tokens, depth, dtype=tf.float32)

    return {
        "smile": smile,
        "decoder_lens": seq_len,
        "decoder_inputs": seq_inputs,
        "decoder_labels": seq_labels,
        "encoder_inputs": encoder_inputs
    }
Beispiel #21
0
def preprocessing_fn(inputs):
    """
    Transform the raw input columns into model features.

    'MixingSpeed' is vocabulary-encoded, 'ButterMass', 'TotalVolume' and
    'Energy' are passed through, 'TotalMass' is the sum of the three
    ingredient masses, each ingredient's share of the total mass is z-scored
    into 'Norm<Ingredient>perc', the remaining numeric columns are z-scored
    in place, and 'Chunks' is 1.0 where 'Problems' contains "chunk".

    Args:
        inputs (dict): dict of input columns
    Returns:
        output dict of transformed columns
    """
    ingredients = ('Butter', 'Sugar', 'Flour')
    passthrough_columns = ('TotalVolume', 'Energy')
    z_scored_columns = (
        'ButterTemperature',
        'SugarHumidity',
        'FlourHumidity',
        'HeatingTime',
        'MixingTime',
        'Density',
        'Temperature',
        'Humidity',
    )

    outputs = {}
    # Categorical column.
    outputs['MixingSpeed'] = tft.compute_and_apply_vocabulary(
        inputs['MixingSpeed'])
    outputs['ButterMass'] = inputs['ButterMass']

    # Derived features: total mass and the normalized share per ingredient.
    total_mass = (inputs['ButterMass'] + inputs['SugarMass'] +
                  inputs['FlourMass'])
    outputs['TotalMass'] = total_mass
    for ingredient in ingredients:
        share = inputs[ingredient + 'Mass'] / total_mass
        outputs['Norm' + ingredient + 'perc'] = tft.scale_to_z_score(share)

    # Columns kept at their absolute values.
    for column in passthrough_columns:
        outputs[column] = inputs[column]

    # Remaining numeric columns are normalized.
    for column in z_scored_columns:
        outputs[column] = tft.scale_to_z_score(inputs[column])

    # Any problem text containing "chunk" collapses to exactly 'chunk', which
    # the equality test below turns into a 0/1 float flag.
    chunk_marker = tf.regex_replace(input=inputs['Problems'],
                                    pattern='.*chunk.*',
                                    rewrite='chunk',
                                    name='DetectChunk')
    chunk_detected = tf.equal(chunk_marker, 'chunk')
    outputs['Chunks'] = tf.cast(chunk_detected, tf.float32)
    return outputs
Beispiel #22
0
    def add_id_lookups(self):
        """Build word/char/label id tensors and the matching length tensors.

        Sets `self.word_lengths`, `self.sequence_lengths`, `self.word_ids`,
        `self.char_ids` and `self.labels`. Ids at positions beyond the
        computed lengths are replaced by zeros.
        """
        # Only '' is in the mapping (index 0) and the default is 1, so
        # summing the lookups over an axis counts the non-empty entries
        # (assuming `lookup` follows tf.contrib.lookup semantics).
        non_empty = lookup.index_table_from_tensor(mapping=tf.constant(['']),
                                                   default_value=1)

        sentences_shape = tf.shape(self.padded_sentences, out_type=tf.int64)

        # Split every word into characters and densify with '' as padding.
        known_char_sentences = remove_unknown_chars(self.padded_sentences,
                                                    self.char_table)
        flat_words = tf.reshape(known_char_sentences, [-1])
        sparse_chars = tf.string_split(flat_words, delimiter="")
        dense_chars = tf.sparse_tensor_to_dense(sparse_chars,
                                                default_value='')

        # Restore the sentence dimensions in front of the character axis.
        max_word_len = tf.gather_nd(sparse_chars.dense_shape,
                                    tf.constant([1]))
        chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)
        chars = tf.reshape(dense_chars, chars_shape)

        self.word_lengths = tf.reduce_sum(non_empty.lookup(chars), 2)

        # Lower-case the words and replace purely numeric ones by NUM.
        lowered = lowercase(self.padded_sentences)
        sanitised = tf.regex_replace(lowered, '^[0-9]+$', NUM)

        self.sequence_lengths = tf.reduce_sum(non_empty.lookup(sanitised), 1)

        raw_word_ids = self.word_table.lookup(sanitised)
        raw_char_ids = self.char_table.lookup(chars)

        word_mask = tf.sequence_mask(self.sequence_lengths)
        char_mask = tf.sequence_mask(self.word_lengths)

        # Zero out the ids that fall into the padding.
        self.word_ids = tf.where(word_mask, raw_word_ids,
                                 tf.zeros_like(raw_word_ids))
        self.char_ids = tf.where(char_mask, raw_char_ids,
                                 tf.zeros_like(raw_char_ids))

        label_lengths = tf.reduce_sum(non_empty.lookup(self.label_codes), 1)
        labels_mask = tf.sequence_mask(label_lengths)
        raw_labels = self.label_table.lookup(self.label_codes)
        self.labels = tf.where(labels_mask, raw_labels,
                               tf.zeros_like(raw_labels))
    def module_fn_with_preprocessing():
        """Define a text-embedding module that accepts full-text input.

        Punctuation (`\\pP`) is stripped from the input sentences, which are
        then split on spaces. Tokens are mapped to ids through a vocabulary
        table built from `vocabulary_file`, and every sentence is embedded by
        a "sqrtn"-combined sparse lookup into the embeddings variable. The
        result is registered as the "default" hub signature.
        """
        sentences = tf.placeholder(shape=[None],
                                   dtype=tf.string,
                                   name="sentences")

        # Remove punctuation with a regular expression.
        without_punct = tf.regex_replace(input=sentences,
                                         pattern=r"\pP",
                                         rewrite="")
        # Tokenize on spaces; the result is a sparse tensor.
        tokens = tf.string_split(without_punct, " ")

        # Word-embedding variable.
        embedding_shape = [vocab_size + num_oov_buckets, embeddings_dim]
        embeddings_var = tf.get_variable(
            initializer=tf.zeros(embedding_shape),
            name='embedding',
            dtype=tf.float32)

        # Vocabulary table that maps words to ids.
        lookup_table = tf.contrib.lookup.index_table_from_file(
            vocabulary_file=vocabulary_file, num_oov_buckets=num_oov_buckets)

        # Sparse tensor of token ids with the same layout as the tokens.
        token_ids = tf.SparseTensor(
            indices=tokens.indices,
            values=lookup_table.lookup(tokens.values),
            dense_shape=tokens.dense_shape)

        # Fill empty rows with the id of the empty string.
        token_ids, _ = tf.sparse_fill_empty_rows(
            token_ids, lookup_table.lookup(tf.constant("")))

        # NOTE(review): no tf.sparse_reset_shape is applied after filling the
        # empty rows (the call was left disabled) -- confirm that a batch in
        # which every sentence is empty is handled correctly.

        # Combine the token embeddings of each sentence with "sqrtn".
        sentence_embeddings = tf.nn.embedding_lookup_sparse(
            params=embeddings_var,
            sp_ids=token_ids,
            sp_weights=None,
            combiner="sqrtn")

        # The "default" signature is used throughout; a differently named
        # signature would have to be requested explicitly by callers.
        # Inputs and outputs are dicts and may hold several entries.
        hub.add_signature("default", {"sentences": sentences},
                          {"default": sentence_embeddings})
Beispiel #24
0
def vectorize_sentences(sentences):
    """Map a batch of sentences to a dense tensor of vocabulary ids.

    Punctuation is replaced by spaces, the sentences are split with
    `tf.string_split` (default delimiter), the sparse result is densified
    with PADWORD as filler, and every word is looked up in the vocabulary
    file at VOCAB_FILE_PATH. Words missing from the vocabulary map to 0.
    """
    # Punctuation becomes a space so it cannot stick to neighbouring words.
    without_punct = tf.regex_replace(sentences, '[[:punct:]]', ' ')

    # Rows with fewer words than the longest one are filled with PADWORD.
    padded_words = tf.sparse_tensor_to_dense(tf.string_split(without_punct),
                                             default_value=PADWORD)

    # Comma-separated vocabulary file: column 0 is the word, column 1 its id.
    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=VOCAB_FILE_PATH,
        num_oov_buckets=0,
        vocab_size=None,
        default_value=0,  # for words not in vocabulary (OOV)
        key_column_index=0,
        value_column_index=1,
        delimiter=',')
    return vocab_table.lookup(padded_words)
def vectorize_sentences(sentences):
    """Map each sentence of a string tensor to a row of integer word ids.

    Steps: replace characters matching [[:punct:]] with spaces, split on
    tf.string_split's default delimiter, pad the sparse words with PADWORD,
    then look each word up in the vocabulary stored at VOCAB_FILE_PATH.

    Args:
      sentences: string tensor with the input sentences.

    Returns:
      Tensor of vocabulary ids for the padded words. A word that is not in
      the vocabulary is mapped to 0.
    """
    # 1. Neutralize punctuation by turning it into whitespace.
    cleaned = tf.regex_replace(sentences, '[[:punct:]]', ' ')

    # 2. Split into words (sparse), then densify with the padding word.
    sparse_words = tf.string_split(cleaned)
    dense_words = tf.sparse_tensor_to_dense(sparse_words,
                                            default_value=PADWORD)

    # 3. Word -> id lookup backed by a "word,id" comma-separated file.
    #    num_oov_buckets=0 means out-of-vocabulary words use default_value.
    word_to_id = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=VOCAB_FILE_PATH,
        num_oov_buckets=0,
        vocab_size=None,
        default_value=0,
        key_column_index=0,
        value_column_index=1,
        delimiter=',')
    word_ids = word_to_id.lookup(dense_words)

    return word_ids
Beispiel #26
0
  def module_fn_with_preprocessing():
    """Build a full-text embedding module graph that preprocesses its input.

    A string placeholder named "sentences" is normalized, split on spaces and
    mapped to ids through the vocabulary file; the embeddings of each
    sentence's tokens are combined with the "sqrtn" combiner. The placeholder
    and the combined embedding are registered as the "default" signature.
    """
    sentences = tf.placeholder(dtype=tf.string, shape=[None], name="sentences")

    # Minimal normalization: delete every character matched by \pP
    # (punctuation), then tokenize on single spaces.
    without_punct = tf.regex_replace(
        input=sentences, pattern=r"\pP", rewrite="")
    sparse_tokens = tf.string_split(without_punct, " ")

    # A sentence that is empty before or after normalization yields an empty
    # row. An embedding is still wanted for every row, so empty rows are
    # filled with the default token "".
    sparse_tokens, _ = tf.sparse_fill_empty_rows(sparse_tokens, "")
    # When all sentences are empty the SparseTensor has shape [?, 0]; after
    # the fill above the shape must be reset so that it becomes [?, 1].
    sparse_tokens = tf.sparse_reset_shape(sparse_tokens)

    # Zero-initialized embedding matrix with one row per vocabulary entry
    # plus one row per OOV bucket.
    embedding_shape = [vocab_size + num_oov_buckets, embeddings_dim]
    embeddings_var = tf.get_variable(
        name=EMBEDDINGS_VAR_NAME,
        initializer=tf.zeros(embedding_shape),
        dtype=tf.float32)

    # Translate token strings to ids while keeping the sparse layout.
    vocab_table = tf.contrib.lookup.index_table_from_file(
        vocabulary_file=vocabulary_file,
        num_oov_buckets=num_oov_buckets,
    )
    token_ids = vocab_table.lookup(sparse_tokens.values)
    sparse_ids = tf.SparseTensor(
        indices=sparse_tokens.indices,
        values=token_ids,
        dense_shape=sparse_tokens.dense_shape)

    combined_embedding = tf.nn.embedding_lookup_sparse(
        params=embeddings_var,
        sp_ids=sparse_ids,
        sp_weights=None,
        combiner="sqrtn")

    signature_inputs = {"sentences": sentences}
    signature_outputs = {"default": combined_embedding}
    hub.add_signature("default", signature_inputs, signature_outputs)
Beispiel #27
0
def get_doc_fields(hparams):
    """
    Create a string placeholder and a processed tensor for each document field.

    Every name in hparams.feature_names that starts with 'doc_' is treated as
    a document field. If hparams.regex_replace_pattern is not None, the text
    is first rewritten so that each match of the pattern is surrounded by
    whitespace (the pattern is meant to match punctuation). The text then goes
    through data_fn.process_text and gets a leading dimension of size 1.

    :param hparams: hparams; this function reads vocab_file, UNK,
        feature_names, regex_replace_pattern, CLS, SEP, PAD, max_len, min_len,
        ftr_ext and filter_window_sizes
    :return: tuple (doc_fields, doc_text_placeholders), two lists holding one
        entry per document field in feature_names order
    """
    vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    placeholders = []
    processed_fields = []

    for ftr_name in hparams.feature_names:
        if not ftr_name.startswith('doc_'):
            continue

        # Each document field is fed through its own string-vector placeholder.
        field_placeholder = tf.placeholder(dtype=tf.string,
                                           shape=[None],
                                           name=ftr_name + "_placeholder")
        placeholders.append(field_placeholder)

        text = field_placeholder
        pattern = hparams.regex_replace_pattern
        if pattern is not None:
            # Pad the first capture group of every match with spaces.
            text = tf.regex_replace(input=text,
                                    pattern=pattern,
                                    rewrite=" \\1 ")

        # The filter window size is only passed on for the 'cnn' extractor.
        if hparams.ftr_ext == 'cnn':
            window_size = max(hparams.filter_window_sizes)
        else:
            window_size = 0

        processed = data_fn.process_text(text,
                                         vocab_table,
                                         hparams.CLS,
                                         hparams.SEP,
                                         hparams.PAD,
                                         hparams.max_len,
                                         hparams.min_len,
                                         cnn_filter_window_size=window_size)
        processed_fields.append(tf.expand_dims(processed, axis=0))

    return processed_fields, placeholders
Beispiel #28
0
def ComputeWer(hyps, refs):
    """Counts word errors of hypotheses against their reference transcripts.

  Both inputs are stripped and have runs of whitespace collapsed to a single
  space before comparison. Word errors are the unnormalized edit distance
  between the whitespace-split hypothesis and reference.

  Args:
    hyps: Hypotheses, represented as string tensors of shape [N].
    refs: References, represented as string tensors of shape [N].

  Returns:
    An int64 tensor, word_errs, of size [N, 2]. word_errs[i, 0] is the number
    of word errors in hyps[i] relative to refs[i]; word_errs[i, 1] is the
    number of words in refs[i].
  """
    def _CollapseWhitespace(text):
        stripped = tf.strings.strip(text)
        return tf.regex_replace(stripped, r'\s+', ' ')

    hyps = _CollapseWhitespace(hyps)
    refs = _CollapseWhitespace(refs)

    # Both inputs must be rank 1 and hyps must have the same shape as refs.
    hyps = py_utils.HasRank(hyps, 1)
    refs = py_utils.HasRank(refs, 1)
    hyps = py_utils.HasShape(hyps, tf.shape(refs))

    hyp_words = tf.string_split(hyps)
    ref_tokens = tf.string_split(refs)
    distances = tf.edit_distance(hyp_words, ref_tokens, normalize=False)
    word_errors = tf.to_int64(distances)

    # Deleting every non-space character leaves only the separators; since
    # whitespace was collapsed above, word count = number of spaces + 1.
    only_spaces = tf.regex_replace(refs, '[^ ]', '')
    ref_words = tf.to_int64(tf.strings.length(only_spaces) + 1)
    # An empty reference has zero words rather than one.
    is_empty_ref = tf.equal(refs, '')
    ref_words = tf.where(is_empty_ref,
                         tf.zeros_like(ref_words, tf.int64), ref_words)

    error_column = tf.expand_dims(word_errors, -1)
    count_column = tf.expand_dims(ref_words, -1)
    return tf.concat([error_column, count_column], axis=1)
Beispiel #29
0
def get_query(hparams):
    """
    Build the query placeholder and the processed query tensor.

    When a query is available it is optionally rewritten with
    hparams.regex_replace_pattern (each match gets whitespace on both sides)
    and then passed through data_fn.process_text.

    :param hparams: hparams; this function reads
        add_first_dim_for_query_placeholder, feature_names,
        regex_replace_pattern, vocab_file, UNK, CLS, SEP, PAD, max_len,
        min_len, ftr_ext and filter_window_sizes
    :return: query and query_placeholder; query is returned unprocessed when
        create_placeholder_for_ftrs yields None for it
    """
    # If hparams.add_first_dim_for_query_placeholder is True, the query
    # placeholder has dimension [None]. This is to use the query feature as a
    # document feature in model serving.
    with_first_dim = hparams.add_first_dim_for_query_placeholder
    placeholder_shape = [None] if with_first_dim else []
    query_placeholder, query = create_placeholder_for_ftrs(
        "query_placeholder", placeholder_shape, tf.string, 'query',
        hparams.feature_names)

    if query is None:
        return query, query_placeholder

    if with_first_dim:
        # Drop the dimension that was added only for the placeholder.
        query = tf.squeeze(query, [0])

    # Tokenize: surround the first capture group of each match with spaces.
    pattern = hparams.regex_replace_pattern
    if pattern is not None:
        query = tf.regex_replace(input=query, pattern=pattern,
                                 rewrite=" \\1 ")

    vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    # The filter window size is only passed on for the 'cnn' extractor.
    if hparams.ftr_ext == 'cnn':
        window_size = max(hparams.filter_window_sizes)
    else:
        window_size = 0

    query = data_fn.process_text(query,
                                 vocab_table,
                                 hparams.CLS,
                                 hparams.SEP,
                                 hparams.PAD,
                                 hparams.max_len,
                                 hparams.min_len,
                                 cnn_filter_window_size=window_size)
    return query, query_placeholder
Beispiel #30
0
def get_query(hparams, regex_replace_pattern, add_dimension=False):
    """
    Build the query placeholder and the processed query tensor.

    :param hparams: hparams; this function reads feature_names, vocab_file,
        UNK, CLS, SEP, PAD, max_len, min_len and filter_window_sizes
    :param regex_replace_pattern: The regex pattern to add a white space
        before and after; None disables the rewrite
    :param add_dimension: whether to add a dimension to the query placeholder
        and then remove it from the query (this is to support online model for
        QAP as quasar model serving requires at least one dimension)
    :return: query and query_placeholder; query is returned unprocessed when
        create_placeholder_for_ftrs yields None for it
    """
    # Query text feature: shape [None] with the extra dimension, scalar
    # otherwise.
    placeholder_shape = [None] if add_dimension else []
    query_placeholder, query = create_placeholder_for_ftrs(
        "query_placeholder", placeholder_shape, tf.string, 'query',
        hparams.feature_names)

    if query is None:
        return query, query_placeholder

    if add_dimension:
        # Drop the dimension that was added only for the placeholder.
        query = tf.squeeze(query, [0])

    # Tokenize: surround the first capture group of each match with spaces.
    if regex_replace_pattern is not None:
        query = tf.regex_replace(input=query,
                                 pattern=regex_replace_pattern,
                                 rewrite=" \\1 ")

    vocab_table = vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK)
    # This version always passes the largest filter window size.
    window_size = max(hparams.filter_window_sizes)
    query = data_fn.process_text(query,
                                 vocab_table,
                                 hparams.CLS,
                                 hparams.SEP,
                                 hparams.PAD,
                                 hparams.max_len,
                                 hparams.min_len,
                                 cnn_filter_window_size=window_size)
    return query, query_placeholder
def parse_raw_text(sentence):
    """Tokenizes a text tensor into a sparse sequence of words.

  Every match of _CHAR_TO_FILTER_OUT is replaced by a space before the text is
  split with tf.string_split's default delimiter.

  Args:
    sentence: `tf.string`, with text record to split.

  Returns:
    Dictionary with two entries: `constants.TOKENS` maps to the `SparseTensor`
    of tokens, and `constants.SEQUENCE_LENGTH` maps to the result of
    get_sparse_tensor_size for that `SparseTensor` (described upstream as a
    one-dimensional integer `Tensor`).

  """
    cleaned = tf.regex_replace(
        sentence, _CHAR_TO_FILTER_OUT, ' ', replace_global=True)
    word_tokens = tf.string_split(cleaned)
    sequence_length = get_sparse_tensor_size(word_tokens)
    return {
        constants.TOKENS: word_tokens,
        constants.SEQUENCE_LENGTH: sequence_length,
    }
Beispiel #32
0
def encode_features(strings_tensor, table, n_vocab, max_len):
    """
    Build a one hot, character-level representation of a string tensor.

    The character splitting hack is due to this open tensorflow bug:

        https://github.com/tensorflow/tensorflow/pull/12971.

    As a workaround, a non printable character (BEEP, stored in SPLIT_CHAR)
    is inserted after every character and the string is then split on it.
    The source material must therefore never contain that character. BEEP
    was chosen because text is highly unlikely to include it, and because it
    is < 128, which is required to make this hack work.

    :param strings_tensor: string tensor to encode
    :param table: lookup table applied to the split characters
    :param n_vocab: depth of the one hot encoding
    :param max_len: number of leading positions kept along the second axis
    :return: the one hot tensor built from the truncated ids
    """
    # '\\0' re-emits the matched character, so each one gains a trailing
    # SPLIT_CHAR that the split below consumes.
    interleaved = tf.regex_replace(strings_tensor, '.',
                                   '\\0%s' % SPLIT_CHAR)
    characters = tf.string_split(interleaved, delimiter=SPLIT_CHAR)

    # Look the characters up, then densify with 0 as the filler value.
    char_ids = tf.sparse_tensor_to_dense(table.lookup(characters),
                                         default_value=0)
    truncated_ids = char_ids[:, 0:max_len]

    return tf.one_hot(truncated_ids, n_vocab)