Code example #1
    def feature_data(column, metadata, preprocessing_parameters):
        char_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['char_str2idx'],
            tokenizer_type=preprocessing_parameters['char_tokenizer'],
            length_limit=metadata['char_max_sequence_length'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['char_vocab_file'],
        )
        word_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['word_str2idx'],
            tokenizer_type=preprocessing_parameters['word_tokenizer'],
            length_limit=metadata['word_max_sequence_length'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['word_vocab_file'],
        )

        return char_data, word_data
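
The call contract is easiest to see with concrete inputs. Below is a minimal sketch of the two dictionaries the function above expects; the key names come from the call sites in the example, while the values are illustrative assumptions only:

# Hypothetical inputs for the feature_data example above.
# Key names match the call sites; the values are assumptions for illustration.
metadata = {
    'char_str2idx': {'<PAD>': 0, '<UNK>': 1, 'a': 2, 'b': 3},  # char-level vocabulary
    'char_max_sequence_length': 16,
    'word_str2idx': {'<PAD>': 0, '<UNK>': 1, 'hello': 2},      # word-level vocabulary
    'word_max_sequence_length': 8,
}
preprocessing_parameters = {
    'char_tokenizer': 'characters',  # split into individual characters
    'word_tokenizer': 'space',       # split on whitespace
    'padding_symbol': '<PAD>',
    'padding': 'right',
    'unknown_symbol': '<UNK>',
    'lowercase': True,
    'char_vocab_file': None,         # no external vocabulary file
    'word_vocab_file': None,
}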
Code example #2
File: text_feature.py  Project: yunasystems/ludwig
    def feature_data(column, metadata, preprocessing_parameters, backend):
        char_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['char_str2idx'],
            tokenizer_type=preprocessing_parameters['char_tokenizer'],
            length_limit=metadata['char_max_sequence_length'],
            padding_symbol=metadata['char_pad_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=metadata['char_unk_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['char_vocab_file'],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'],
            processor=backend.df_engine)
        word_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['word_str2idx'],
            tokenizer_type=preprocessing_parameters['word_tokenizer'],
            length_limit=metadata['word_max_sequence_length'],
            padding_symbol=metadata['word_pad_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=metadata['word_unk_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['word_vocab_file'],
            pretrained_model_name_or_path=preprocessing_parameters[
                'pretrained_model_name_or_path'],
            processor=backend.df_engine)

        return char_data, word_data
Code example #3
    def feature_data(column, metadata, preprocessing_parameters):
        char_data = build_sequence_matrix(
            column,
            metadata['char_str2idx'],
            preprocessing_parameters['char_tokenizer'],
            metadata['char_max_sequence_length'],
            preprocessing_parameters['padding_symbol'],
            preprocessing_parameters['padding'],
            preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters[
                'char_tokenizer_vocab_file'],
        )
        word_data = build_sequence_matrix(
            column,
            metadata['word_str2idx'],
            preprocessing_parameters['word_tokenizer'],
            metadata['word_max_sequence_length'],
            preprocessing_parameters['padding_symbol'],
            preprocessing_parameters['padding'],
            preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters[
                'word_tokenizer_vocab_file'],
        )

        return char_data, word_data
Code example #4
    def feature_data(column, metadata, preprocessing_parameters):
        sequence_data = build_sequence_matrix(
            column, metadata['str2idx'], preprocessing_parameters['format'],
            metadata['max_sequence_length'],
            preprocessing_parameters['padding_symbol'],
            preprocessing_parameters['padding'],
            preprocessing_parameters['lowercase'])
        return sequence_data
Code example #5
    def feature_data(column, metadata, preprocessing_parameters):
        char_data = build_sequence_matrix(
            column, metadata['char_str2idx'],
            preprocessing_parameters['char_format'],
            metadata['char_max_sequence_length'],
            preprocessing_parameters['padding_symbol'],
            preprocessing_parameters['padding'],
            preprocessing_parameters['lowercase'])
        word_data = build_sequence_matrix(
            column, metadata['word_str2idx'],
            preprocessing_parameters['word_format'],
            metadata['word_max_sequence_length'],
            preprocessing_parameters['padding_symbol'],
            preprocessing_parameters['padding'],
            preprocessing_parameters['lowercase'])

        return char_data, word_data
Code example #6
    def feature_data(column, metadata, preprocessing_parameters, backend):
        sequence_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata['str2idx'],
            tokenizer_type=preprocessing_parameters['tokenizer'],
            length_limit=metadata['max_sequence_length'],
            padding_symbol=preprocessing_parameters['padding_symbol'],
            padding=preprocessing_parameters['padding'],
            unknown_symbol=preprocessing_parameters['unknown_symbol'],
            lowercase=preprocessing_parameters['lowercase'],
            tokenizer_vocab_file=preprocessing_parameters['vocab_file'],
            processor=backend.df_engine)
        return sequence_data
Code example #7
File: sequence_feature.py  Project: ludwig-ai/ludwig
    def feature_data(column, metadata, preprocessing_parameters, backend):
        sequence_data = build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata["str2idx"],
            tokenizer_type=preprocessing_parameters["tokenizer"],
            length_limit=metadata["max_sequence_length"],
            padding_symbol=preprocessing_parameters["padding_symbol"],
            padding=preprocessing_parameters["padding"],
            unknown_symbol=preprocessing_parameters["unknown_symbol"],
            lowercase=preprocessing_parameters["lowercase"],
            tokenizer_vocab_file=preprocessing_parameters["vocab_file"],
            processor=backend.df_engine,
        )
        return sequence_data
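
None of the backend-aware examples show where `backend` comes from. Here is a hedged sketch of driving the version above with Ludwig's local backend; it assumes `LOCAL_BACKEND` (which exposes a pandas-based `df_engine`) is importable from `ludwig.backend` in the installed version, and that `feature_data` is callable as shown:

import pandas as pd

from ludwig.backend import LOCAL_BACKEND  # assumption: exported by this Ludwig version

# metadata and preprocessing_parameters as sketched after Code example #1,
# but with unprefixed keys ('str2idx', 'max_sequence_length', 'tokenizer', 'vocab_file').
column = pd.Series(['a b c', 'c b a'])
sequence_data = feature_data(column, metadata, preprocessing_parameters, LOCAL_BACKEND)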
Code example #8
File: test_strings_utils.py  Project: yarenty/ludwig
import numpy as np
import pandas as pd

from ludwig.utils import strings_utils


def test_build_sequence_matrix():
    inverse_vocabulary = {
        "<EOS>": 0,
        "<SOS>": 1,
        "<PAD>": 2,
        "<UNK>": 3,
        "a": 4,
        "b": 5,
        "c": 6,
    }
    sequences = pd.Series(["a b c", "c b a"])
    sequence_matrix = strings_utils.build_sequence_matrix(
        sequences, inverse_vocabulary, tokenizer_type="space", length_limit=10)
    # Each row: <SOS>, the token indices, <EOS>, then <PAD> up to length_limit.
    expected = np.array([[1, 4, 5, 6, 0, 2, 2, 2, 2, 2],
                         [1, 6, 5, 4, 0, 2, 2, 2, 2, 2]])
    assert np.array_equal(sequence_matrix, expected)
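
The expected matrix spells out the encoding convention the test relies on: each row starts with <SOS> (index 1), the token indices follow in order, <EOS> (index 0) closes the sequence, and <PAD> (index 2) fills the rest of the row up to length_limit.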
Code example #9
File: text_feature.py  Project: ludwig-ai/ludwig
    def feature_data(column, metadata, preprocessing_parameters, backend):
        # TODO(1891): Remove backward compatibility hack once all models have been retrained with Ludwig after
        # https://github.com/ludwig-ai/ludwig/pull/1859.
        prefix = ""
        padding_symbol_metadata_key = "padding_symbol"
        unknown_symbol_metadata_key = "unknown_symbol"
        if "str2idx" not in metadata:
            prefix = "word_"
            padding_symbol_metadata_key = "word_pad_symbol"
            unknown_symbol_metadata_key = "word_unk_symbol"

        # ensure preprocessing param values match the metadata determined from dataset
        preprocessing_parameters["padding_symbol"] = metadata[
            padding_symbol_metadata_key]
        preprocessing_parameters["unknown_symbol"] = metadata[
            unknown_symbol_metadata_key]
        if preprocessing_parameters["fill_value"] == UNKNOWN_SYMBOL:
            preprocessing_parameters["fill_value"] = preprocessing_parameters[
                "unknown_symbol"]
        if ("computed_fill_value" in preprocessing_parameters
                and preprocessing_parameters["computed_fill_value"]
                == UNKNOWN_SYMBOL):
            preprocessing_parameters[
                "computed_fill_value"] = preprocessing_parameters[
                    "unknown_symbol"]

        return build_sequence_matrix(
            sequences=column,
            inverse_vocabulary=metadata[f"{prefix}str2idx"],
            tokenizer_type=preprocessing_parameters[f"{prefix}tokenizer"],
            length_limit=metadata[f"{prefix}max_sequence_length"],
            padding_symbol=metadata[padding_symbol_metadata_key],
            padding=preprocessing_parameters["padding"],
            unknown_symbol=metadata[unknown_symbol_metadata_key],
            lowercase=preprocessing_parameters["lowercase"],
            tokenizer_vocab_file=preprocessing_parameters[
                f"{prefix}vocab_file"],
            pretrained_model_name_or_path=preprocessing_parameters[
                "pretrained_model_name_or_path"],
            processor=backend.df_engine,
        )
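
The backward-compatibility branch at the top keys off which metadata layout it receives. A sketch of the two shapes it distinguishes; only the key names are taken from the code above, the values are hypothetical:

# Current layout ("str2idx" present): prefix stays "".
current_metadata = {
    "str2idx": {"<PAD>": 0, "<UNK>": 1},
    "max_sequence_length": 128,
    "padding_symbol": "<PAD>",
    "unknown_symbol": "<UNK>",
}

# Legacy layout (models trained before ludwig-ai/ludwig#1859): word_-prefixed keys,
# so prefix becomes "word_" and the word_pad_symbol / word_unk_symbol keys are read.
legacy_metadata = {
    "word_str2idx": {"<PAD>": 0, "<UNK>": 1},
    "word_max_sequence_length": 128,
    "word_pad_symbol": "<PAD>",
    "word_unk_symbol": "<UNK>",
}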