def preprocessing_fn_train(inputs):
    """Tokenize and integer-encode the training 'Context'/'Utterance' columns.

    Both columns are split on DELIMITERS. One vocabulary is computed over the
    tokens of the two columns concatenated together (stored under the name
    'anantvir_train_vocab') and that same vocabulary is applied to each
    column, so a token gets the same id regardless of the column it is in.

    Args:
        inputs: mapping from raw feature name to tensor; must provide the
            'Context' and 'Utterance' features.

    Returns:
        Dict with keys 'Context' and 'Utterance' holding the tokenized
        columns mapped through the shared vocabulary.
    """
    context = inputs['Context']
    utterance = inputs['Utterance']
    # Stack both columns so the vocabulary covers every token of either one.
    vocab = tf.concat([context, utterance], 0)

    context_tokens = tf.compat.v1.string_split(context, DELIMITERS)
    utterance_tokens = tf.compat.v1.string_split(utterance, DELIMITERS)
    vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)

    vocab_mapping_file_path = tft.vocabulary(
        vocab_tokens, vocab_filename='anantvir_train_vocab')

    # The stray debug print of the mapped tensor was dropped: it only emitted
    # a tensor repr while the graph was being traced.
    mapped_context = tft.apply_vocabulary(
        context_tokens,
        deferred_vocab_filename_tensor=vocab_mapping_file_path)
    mapped_utterance = tft.apply_vocabulary(
        utterance_tokens,
        deferred_vocab_filename_tensor=vocab_mapping_file_path)

    return {
        'Context': mapped_context,
        'Utterance': mapped_utterance,
    }
コード例 #2
0
    def preprocessing_fn(inputs):
        """Preprocess input columns into transformed columns.

        Numerical features are bucketized into 20 buckets and rescaled,
        categorical features (plus two crossed features built from
        ``sourceGameId`` with ``zone`` / ``campaignId``) are integer-encoded
        through per-feature vocabularies, and ``label`` / ``key`` are passed
        through unchanged.

        Args:
            inputs: mapping from feature name to raw tensor. It is not
                modified; crossed features are added to a local copy.

        Returns:
            Dict mapping output feature name to transformed tensor.
        """
        outputs = {}

        for key in numerical_feats:
            # Bucket index over 20 buckets, divided by 20 and shifted by -0.5.
            outputs[key] = tf.cast(tft.bucketize(inputs[key], 20),
                                   tf.float32) / 20.0 - 0.5

        outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0

        # Shallow copy so the crossed features do not leak into the caller's
        # dict (the original wrote them straight into ``inputs``).
        features = dict(inputs)
        features["game_zone"] = tf.strings.join(
            [inputs["sourceGameId"], inputs["zone"]], separator="_")
        features["game_campaignId"] = tf.strings.join(
            [inputs["sourceGameId"], inputs["campaignId"]], separator="_")

        for key in categorical_feats + ["game_zone", "game_campaignId"]:
            vocab = tft.vocabulary(features[key],
                                   vocab_filename=key,
                                   frequency_threshold=100)
            # NOTE(review): default_value=0 gives out-of-vocabulary values the
            # same id as vocabulary entry 0 - confirm this is intended.
            outputs[key] = tft.apply_vocabulary(features[key],
                                                vocab,
                                                default_value=0)

        outputs["label"] = inputs["label"]
        outputs["key"] = inputs["key"]

        return outputs
コード例 #3
0
def preprocessing_fn(inputs):
    """Integer-encode the current-page feature and the label.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
        A copy of ``inputs`` in which the current-page feature and the label
        have been replaced by their vocabulary indices.
    """
    encoded_keys = [_CUR_PAGE_FEATURE_KEY, _LABEL_KEY]

    # One shared vocabulary, limited to the _TOP_K entries, computed over the
    # current pages and the labels together.
    combined_values = tf.concat([inputs[key] for key in encoded_keys], axis=0)
    vocab_file = tft.vocabulary(combined_values,
                                top_k=_TOP_K,
                                vocab_filename=_VOCAB_FILENAME)

    transformed = inputs.copy()
    # Strings become integers; anything outside the vocabulary is mapped to
    # the _TOP_K value.
    transformed.update({
        key: tft.apply_vocabulary(inputs[key],
                                  vocab_file,
                                  default_value=_TOP_K)
        for key in encoded_keys
    })
    return transformed
コード例 #4
0
    def preprocessing_fn(
            inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """tf.transform callback that encodes categorical and numerical features.

        Parameters
        ----------
        inputs
            Map from feature keys to raw not-yet-transformed features.

        Returns
        -------
        Map from transformed feature name to transformed feature operations.
        """
        transformed = {}

        # Categorical features and the label: fill missing values, then
        # replace each value by its index in a per-feature vocabulary.
        for feature in categorical_feature_keys + [label_key]:
            out_name = _transformed_name(feature)
            dense = _fill_in_missing(inputs[feature])
            vocab_file = tft.vocabulary(dense, vocab_filename=feature)
            transformed[out_name] = tft.apply_vocabulary(dense, vocab_file)

        # NOTE: This won't be correct in the incremental case since it's only
        # using the new examples to get the mean and variance.
        for feature in numerical_feature_keys:
            z_scored = tft.scale_to_z_score(_fill_in_missing(inputs[feature]))
            transformed[_transformed_name(feature)] = tf.expand_dims(
                z_scored, axis=1)

        return transformed
コード例 #5
0
ファイル: preprocess.py プロジェクト: CRSilkworth/path_nn
    def preprocessing_fn(
            inputs: Dict[Text, tf.Tensor]) -> Dict[Text, tf.Tensor]:
        """tf.transform callback that encodes the label and rescales the pixels.

        Parameters
        ----------
        inputs
            Map from feature keys to raw not-yet-transformed features. The
            pixel columns are read from the keys ``"1"`` .. ``str(num_pixels)``.

        Returns
        -------
        Map from transformed feature name to transformed feature operations:
        the vocabulary-encoded label and the pixel columns concatenated along
        axis 1 and scaled to [0, 1].
        """
        outputs = {}

        # Label: fill missing values, then map each value to its vocab index.
        label_name = _transformed_name(label_key)
        filled_label = _fill_in_missing(inputs[label_key])
        vocab_file_tensor = tft.vocabulary(filled_label,
                                           vocab_filename=label_key)
        outputs[label_name] = tft.apply_vocabulary(filled_label,
                                                   vocab_file_tensor)

        # Pixels: the original stored the concatenation under
        # _transformed_name(pixel_key) but rescaled
        # _transformed_name('pixels'), which raises KeyError unless
        # pixel_key == 'pixels'. Use pixel_key for both steps.
        pixels = tf.concat(
            [_fill_in_missing(inputs[str(i + 1)]) for i in range(num_pixels)],
            axis=1)

        # NOTE: This won't be correct in the incremental case since it's only
        # using the new examples to get the min and max.
        outputs[_transformed_name(pixel_key)] = tft.scale_to_0_1(pixels)

        return outputs
コード例 #6
0
def _preprocess_tft(raw_data, user_freq, item_freq):
    """Build vocabularies for users, items, artists and tags and apply them.

    Args:
        raw_data: a dict of shape {$user_key: tensor, $item_key: tensor, ...}.
        user_freq: minimum frequency of a user to include it in the user vocab.
        item_freq: minimum frequency of an item to include it in the item vocab.

    Returns:
        A dict containing int ids corresponding to a user_id and item_id and
        other features: {$user_key: $user_id, $item_key: $item_id, ...}.
    """
    default_id = constants.TFT_DEFAULT_ID

    # The item vocabulary is built once and shared by the item column and the
    # top-10 column.
    item_vocab = tft.vocabulary(raw_data[constants.ITEM_KEY],
                                vocab_filename=constants.ITEM_VOCAB_NAME,
                                frequency_threshold=item_freq)

    # Start from the pass-through features, then add the encoded ones.
    features = {}
    for feature in constants.BQ_FEATURES:
        features[feature] = raw_data[feature]

    features[constants.TFT_USER_KEY] = tft.compute_and_apply_vocabulary(
        raw_data[constants.USER_KEY],
        vocab_filename=constants.USER_VOCAB_NAME,
        frequency_threshold=user_freq,
        default_value=default_id)
    features[constants.TFT_ITEM_KEY] = tft.apply_vocabulary(
        raw_data[constants.ITEM_KEY],
        item_vocab,
        default_value=default_id)
    features[constants.TFT_ARTIST_KEY] = tft.compute_and_apply_vocabulary(
        raw_data[constants.ARTIST_KEY],
        vocab_filename=constants.ARTIST_VOCAB_NAME,
        default_value=default_id)
    features[constants.TFT_TAGS_KEY] = tft.compute_and_apply_vocabulary(
        raw_data[constants.TAGS_KEY],
        vocab_filename=constants.TAG_VOCAB_NAME,
        default_value=default_id)
    features[constants.TFT_TOP_10_KEY] = tft.apply_vocabulary(
        raw_data[constants.TOP_10_KEY],
        item_vocab,
        default_value=default_id)
    return features
コード例 #7
0
def preprocessing_fn(inputs):
    """
    Preprocess data inputs.
    This is a callback function for tfx.components.Transform.

    Parameters
    ----------
    inputs : dict, tensorflow_transform data
        Data being fed into tfx.components.Transform.
        Map from feature keys to raw not-yet-transformed features.

    Returns
    -------
    output: dict
        Map from string feature key to transformed feature operations.
    """
    series_ids = inputs["InSeasonSeries_Id"]
    tokens = inputs["token"]
    counts = inputs["token_count"]

    # String to integer indexing: one vocabulary shared by series ids and
    # tokens.
    all_nodes = tf.concat([series_ids, tokens], axis=0)
    vocab_uri = tft.vocabulary(
        all_nodes,
        vocab_filename="node_vocab.txt",
        name="node_vocab",
    )
    logging.info(f"graph vocabulary uri: {vocab_uri}")

    # Values missing from the vocabulary map to -1; the weight is the
    # reciprocal of the token count.
    return {
        "InSeasonSeries_Id": tft.apply_vocabulary(
            series_ids,
            deferred_vocab_filename_tensor=vocab_uri,
            default_value=-1),
        "token": tft.apply_vocabulary(
            tokens,
            deferred_vocab_filename_tensor=vocab_uri,
            default_value=-1),
        "weight": tf.constant([1.0], dtype="float32") / tf.cast(
            counts, "float32"),
    }
コード例 #8
0
def _default_preprocessing_fn(inputs, input_features):
    """Apply the default encodings to numerical and categorical features.

    Args:
        inputs: map from feature keys to raw not-yet-transformed features.
        input_features: mapping with the feature-name lists
            "numerical_default_encoding" and "categorical_default_encoding",
            and optionally "label" naming the label column in ``inputs``.

    Returns:
        Dict mapping each encoded feature name (plus "label" when configured)
        to its transformed tensor.
    """
    encoded = {}

    # Numerical: bucket index over 20 buckets, divided by 20, shifted by -0.5.
    for name in input_features["numerical_default_encoding"]:
        bucket_ids = tft.bucketize(inputs[name], 20)
        encoded[name] = tf.cast(bucket_ids, tf.float32) / 20.0 - 0.5

    # Categorical: per-feature vocabulary with a frequency threshold of 100;
    # values outside the vocabulary get default_value 0.
    for name in input_features["categorical_default_encoding"]:
        column = inputs[name]
        vocab_file = tft.vocabulary(column,
                                    vocab_filename=name,
                                    frequency_threshold=100)
        encoded[name] = tft.apply_vocabulary(column,
                                             vocab_file,
                                             default_value=0)

    if "label" in input_features:
        label_column = input_features["label"]
        encoded["label"] = inputs[label_column]

    return encoded
コード例 #9
0
def preprocessing_fn(inputs, input_features):
    """Preprocess input columns into transformed columns.

    Runs the default encodings via ``_default_preprocessing_fn`` and then
    adds a rescaled campaign cost, two vocabulary-encoded crossed features
    (``sourceGameId`` joined with ``zone`` / ``campaignId``) and the
    pass-through ``key`` column.

    Args:
        inputs: mapping from feature name to raw tensor. It is not modified;
            crossed features are kept in a local dict.
        input_features: feature configuration forwarded to
            ``_default_preprocessing_fn``.

    Returns:
        Dict mapping output feature name to transformed tensor.
    """
    outputs = _default_preprocessing_fn(inputs, input_features)

    outputs["campaignCost_mod"] = inputs["campaignCost"] / 100.0

    # Keep the crossed features local instead of writing them into the
    # caller's ``inputs`` dict as the original did.
    crossed = {
        "game_zone": tf.strings.join(
            [inputs["sourceGameId"], inputs["zone"]], separator="_"),
        "game_campaignId": tf.strings.join(
            [inputs["sourceGameId"], inputs["campaignId"]], separator="_"),
    }

    for key, values in crossed.items():
        vocab = tft.vocabulary(values,
                               vocab_filename=key,
                               frequency_threshold=100)
        # NOTE(review): default_value=0 gives out-of-vocabulary values the
        # same id as vocabulary entry 0 - confirm this is intended.
        outputs[key] = tft.apply_vocabulary(values, vocab, default_value=0)

    outputs["key"] = inputs["key"]

    return outputs
コード例 #10
0
def apply_vocab_fn(inputs):
    """Preprocessing fn that integer-encodes the categorical features.

    Each categorical feature is mapped through a previously created
    vocabulary file located under ``args.temp_dir``; the label and the
    numeric features are passed through unchanged.
    Pre-condition: Full vocab has been materialized.

    Args:
        inputs: Input features to transform.

    Returns:
        Output dict with transformed features.
    """
    transformed = {LABEL_KEY: inputs[LABEL_KEY]}

    for name in NUMERIC_FEATURE_KEYS:
        transformed[name] = inputs[name]

    # Vocabulary files are named after the position of the feature in
    # CATEGORICAL_FEATURE_KEYS, not after the feature itself.
    for position, name in enumerate(CATEGORICAL_FEATURE_KEYS):
        vocab_path = os.path.join(
            args.temp_dir,
            "tftransform_tmp",
            "feature_{}_vocab".format(position),
        )
        transformed[name] = tft.apply_vocabulary(inputs[name], vocab_path)

    return transformed
            def preprocessing_fn_test(inputs):
                """Tokenize and integer-encode the test-set text columns.

                The context, the ground-truth utterance and the nine
                distractors are split on DELIMITERS. One vocabulary is
                computed over the tokens of all eleven columns (stored under
                the name 'anantvir_test_vocab') and applied to each column.

                Args:
                    inputs: mapping from raw feature name to tensor; must
                        provide 'Context', 'Ground Truth Utterance' and
                        'Distractor_0' .. 'Distractor_8'.

                Returns:
                    Dict with the same eleven keys, each holding that
                    column's tokens mapped through the shared vocabulary.
                """
                feature_keys = ['Context', 'Ground Truth Utterance'] + [
                    'Distractor_{}'.format(i) for i in range(9)
                ]

                # Stack every column so the vocabulary covers all of them.
                vocab = tf.concat([inputs[key] for key in feature_keys], 0)

                tokens = {
                    key: tf.compat.v1.string_split(inputs[key], DELIMITERS)
                    for key in feature_keys
                }
                vocab_tokens = tf.compat.v1.string_split(vocab, DELIMITERS)

                vocab_mapping_file_path = tft.vocabulary(
                    vocab_tokens, vocab_filename='anantvir_test_vocab')

                # Each column is mapped from its OWN tokens. The original
                # passed distractor_0's tokens for Distractor_1..8, so all
                # nine distractor outputs were copies of Distractor_0.
                return {
                    key: tft.apply_vocabulary(
                        tokens[key],
                        deferred_vocab_filename_tensor=vocab_mapping_file_path)
                    for key in feature_keys
                }