Esempio n. 1
0
 def feature_meta(column, most_common_characters, most_common_words,
                  lowercase):
     """Compute character- and word-level vocabulary metadata for a column.

     Returns an 8-tuple: (idx2str, str2idx, str2freq, max_len) for the
     character vocabulary followed by the same four items for the word
     vocabulary.
     """
     # Character-level vocabulary.
     c_idx2str, c_str2idx, c_str2freq, c_max_len = create_vocabulary(
         column,
         'characters',
         num_most_frequent=most_common_characters,
         lowercase=lowercase
     )
     # Word-level vocabulary via English tokenization.
     w_idx2str, w_str2idx, w_str2freq, w_max_len = create_vocabulary(
         column,
         'english_tokenize',
         num_most_frequent=most_common_words,
         lowercase=lowercase
     )
     return (c_idx2str, c_str2idx, c_str2freq, c_max_len,
             w_idx2str, w_str2idx, w_str2freq, w_max_len)
Esempio n. 2
0
 def feature_meta(column, preprocessing_parameters, backend):
     """Build character- and word-level vocabulary metadata for a text column.

     Returns a 16-tuple: eight character-vocabulary items (idx2str,
     str2idx, str2freq, max_len, 99th-percentile max_len, pad index,
     pad symbol, unk symbol) followed by the same eight items for the
     word vocabulary.
     """
     # Keyword arguments shared by both create_vocabulary calls.
     common_kwargs = dict(
         lowercase=preprocessing_parameters["lowercase"],
         unknown_symbol=preprocessing_parameters["unknown_symbol"],
         padding_symbol=preprocessing_parameters["padding_symbol"],
         pretrained_model_name_or_path=preprocessing_parameters[
             "pretrained_model_name_or_path"],
         processor=backend.df_engine,
     )
     # Character-level vocabulary.
     (c_idx2str, c_str2idx, c_str2freq, c_max_len, c_99ptile_max_len,
      c_pad_idx, c_pad_symbol, c_unk_symbol) = create_vocabulary(
         column,
         tokenizer_type="characters",
         num_most_frequent=preprocessing_parameters["char_most_common"],
         **common_kwargs,
     )
     # Word-level vocabulary.
     (w_idx2str, w_str2idx, w_str2freq, w_max_len, w_99ptile_max_len,
      w_pad_idx, w_pad_symbol, w_unk_symbol) = create_vocabulary(
         column,
         tokenizer_type=preprocessing_parameters["word_tokenizer"],
         num_most_frequent=preprocessing_parameters["word_most_common"],
         vocab_file=preprocessing_parameters["word_vocab_file"],
         **common_kwargs,
     )
     return (
         c_idx2str, c_str2idx, c_str2freq, c_max_len, c_99ptile_max_len,
         c_pad_idx, c_pad_symbol, c_unk_symbol,
         w_idx2str, w_str2idx, w_str2freq, w_max_len, w_99ptile_max_len,
         w_pad_idx, w_pad_symbol, w_unk_symbol,
     )
Esempio n. 3
0
 def feature_meta(column, preprocessing_parameters, backend):
     """Return character- and word-level vocabulary metadata for a column.

     The result is a 14-tuple: (idx2str, str2idx, str2freq, max_len,
     pad_idx, pad_symbol, unk_symbol) for characters, then the same
     seven items for words.
     """
     def build_vocab(tokenizer_type, most_common_key, **extra):
         # Thin wrapper sharing the parameters common to both calls.
         return create_vocabulary(
             column,
             tokenizer_type=tokenizer_type,
             num_most_frequent=preprocessing_parameters[most_common_key],
             lowercase=preprocessing_parameters['lowercase'],
             unknown_symbol=preprocessing_parameters['unknown_symbol'],
             padding_symbol=preprocessing_parameters['padding_symbol'],
             pretrained_model_name_or_path=preprocessing_parameters[
                 'pretrained_model_name_or_path'],
             processor=backend.df_engine,
             **extra
         )

     char_meta = build_vocab('characters', 'char_most_common')
     word_meta = build_vocab(
         preprocessing_parameters['word_tokenizer'],
         'word_most_common',
         vocab_file=preprocessing_parameters['word_vocab_file']
     )
     (char_idx2str, char_str2idx, char_str2freq, char_max_len,
      char_pad_idx, char_pad_symbol, char_unk_symbol) = char_meta
     (word_idx2str, word_str2idx, word_str2freq, word_max_len,
      word_pad_idx, word_pad_symbol, word_unk_symbol) = word_meta
     return (char_idx2str, char_str2idx, char_str2freq, char_max_len,
             char_pad_idx, char_pad_symbol, char_unk_symbol,
             word_idx2str, word_str2idx, word_str2freq, word_max_len,
             word_pad_idx, word_pad_symbol, word_unk_symbol)
Esempio n. 4
0
 def feature_meta(column, preprocessing_parameters):
     """Compute char- and word-level vocabulary metadata for a column.

     Returns the four character-vocabulary items followed by the four
     word-vocabulary items as a single 8-tuple.
     """
     lowercase = preprocessing_parameters['lowercase']
     # Character-level vocabulary.
     char_meta = create_vocabulary(
         column,
         'characters',
         num_most_frequent=preprocessing_parameters['char_most_common'],
         lowercase=lowercase)
     # Word-level vocabulary, tokenized per the configured word format.
     word_meta = create_vocabulary(
         column,
         preprocessing_parameters['word_format'],
         num_most_frequent=preprocessing_parameters['word_most_common'],
         lowercase=lowercase)
     (char_idx2str, char_str2idx, char_str2freq, char_max_len) = char_meta
     (word_idx2str, word_str2idx, word_str2freq, word_max_len) = word_meta
     return (char_idx2str, char_str2idx, char_str2freq, char_max_len,
             word_idx2str, word_str2idx, word_str2freq, word_max_len)
Esempio n. 5
0
def test_create_vocabulary_word():
    """create_vocabulary with the default word tokenizer should produce a
    vocabulary containing every special symbol at its reserved index.

    Fixed: the original asserted the UNKNOWN symbol twice (a copy-paste
    duplicate) and never checked the START symbol at all.
    """
    data = pd.DataFrame([
        "Hello, I'm a single sentence!", "And another sentence",
        "And the very very last one"
    ])
    column = data[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()

    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type=preprocessing_parameters["word_tokenizer"],
        num_most_frequent=preprocessing_parameters["word_most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        vocab_file=preprocessing_parameters["word_vocab_file"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path=preprocessing_parameters[
            "pretrained_model_name_or_path"],
    )

    idx2str = vocabulary_output[0]
    assert len(idx2str) == 19
    # One assertion per special symbol, each at its reserved index.
    # NOTE(review): assumes strings_utils exposes SpecialSymbol.START and
    # START_SYMBOL alongside the STOP/PADDING/UNKNOWN ones used below — confirm.
    assert idx2str[
        strings_utils.SpecialSymbol.START.value] == strings_utils.START_SYMBOL
    assert idx2str[
        strings_utils.SpecialSymbol.STOP.value] == strings_utils.STOP_SYMBOL
    assert idx2str[strings_utils.SpecialSymbol.PADDING.
                   value] == strings_utils.PADDING_SYMBOL
    assert idx2str[strings_utils.SpecialSymbol.UNKNOWN.
                   value] == strings_utils.UNKNOWN_SYMBOL
Esempio n. 6
0
 def feature_meta(column, preprocessing_parameters, backend):
     """Build vocabulary metadata for a text column.

     Returns an 8-tuple: (idx2str, str2idx, str2freq, max_len,
     max_len_99ptile, pad_idx, padding_symbol, unknown_symbol).
     """
     params = preprocessing_parameters
     vocab = create_vocabulary(
         column,
         tokenizer_type=params["tokenizer"],
         num_most_frequent=params["most_common"],
         lowercase=params["lowercase"],
         vocab_file=params["vocab_file"],
         unknown_symbol=params["unknown_symbol"],
         padding_symbol=params["padding_symbol"],
         pretrained_model_name_or_path=params["pretrained_model_name_or_path"],
         processor=backend.df_engine,
     )
     # Unpack for readability; create_vocabulary yields exactly eight items.
     (idx2str, str2idx, str2freq, max_len, max_len_99ptile,
      pad_idx, padding_symbol, unknown_symbol) = vocab
     return (idx2str, str2idx, str2freq, max_len, max_len_99ptile,
             pad_idx, padding_symbol, unknown_symbol)
Esempio n. 7
0
 def get_feature_meta(column, preprocessing_parameters, backend):
     """Compute vocabulary metadata for a category-like column.

     Values are stringified first and no padding token is added. Returns
     a dict with idx2str, str2idx, str2freq and the vocabulary size.
     """
     column = column.astype(str)
     index_to_str, str_to_index, str_to_freq, _, _, _, _ = create_vocabulary(
         column,
         "stripped",
         num_most_frequent=preprocessing_parameters["most_common"],
         lowercase=preprocessing_parameters["lowercase"],
         add_padding=False,
         processor=backend.df_engine,
     )
     return {
         "idx2str": index_to_str,
         "str2idx": str_to_index,
         "str2freq": str_to_freq,
         "vocab_size": len(str_to_index),
     }
Esempio n. 8
0
 def feature_meta(column, preprocessing_parameters):
     """Compute character- and word-level vocabulary metadata for a column.

     Returns an 8-tuple: four character-vocabulary items followed by
     four word-vocabulary items.
     """
     params = preprocessing_parameters
     # Character-level vocabulary.
     c_idx2str, c_str2idx, c_str2freq, c_max_len = create_vocabulary(
         column,
         tokenizer_type='characters',
         num_most_frequent=params['char_most_common'],
         lowercase=params['lowercase'],
         unknown_symbol=params['unknown_symbol'],
         padding_symbol=params['padding_symbol'])
     # Word-level vocabulary.
     w_idx2str, w_str2idx, w_str2freq, w_max_len = create_vocabulary(
         column,
         tokenizer_type=params['word_tokenizer'],
         num_most_frequent=params['word_most_common'],
         lowercase=params['lowercase'],
         vocab_file=params['word_vocab_file'],
         unknown_symbol=params['unknown_symbol'],
         padding_symbol=params['padding_symbol'])
     return (c_idx2str, c_str2idx, c_str2freq, c_max_len,
             w_idx2str, w_str2idx, w_str2freq, w_max_len)
Esempio n. 9
0
 def get_feature_meta(column, preprocessing_parameters):
     """Compute vocabulary metadata for a set-like column.

     Returns a dict with the vocabulary mappings, the vocabulary size,
     and the maximum number of items observed in a single set.
     """
     index_to_str, str_to_index, str_to_freq, max_size = create_vocabulary(
         column,
         preprocessing_parameters['tokenizer'],
         num_most_frequent=preprocessing_parameters['most_common'],
         lowercase=preprocessing_parameters['lowercase'])
     meta = {'idx2str': index_to_str}
     meta['str2idx'] = str_to_index
     meta['str2freq'] = str_to_freq
     meta['vocab_size'] = len(str_to_index)
     meta['max_set_size'] = max_size
     return meta
Esempio n. 10
0
 def get_feature_meta(column, preprocessing_parameters):
     """Return vocabulary metadata for a category column.

     No padding token is added; the column values are used as-is after
     stripping. The observed max size is discarded.
     """
     vocab_info = create_vocabulary(
         column,
         'stripped',
         num_most_frequent=preprocessing_parameters['most_common'],
         lowercase=preprocessing_parameters['lowercase'],
         add_padding=False)
     index_to_str, str_to_index, str_to_freq, _ = vocab_info
     return {
         'idx2str': index_to_str,
         'str2idx': str_to_index,
         'str2freq': str_to_freq,
         'vocab_size': len(str_to_index)
     }
Esempio n. 11
0
 def get_feature_meta(column, preprocessing_parameters, backend):
     """Compute vocabulary metadata for a category-like column.

     The column is stringified first and no padding token is added.
     """
     column = column.astype(str)
     index_to_str, str_to_index, str_to_freq, _, _, _, _ = create_vocabulary(
         column,
         'stripped',
         num_most_frequent=preprocessing_parameters['most_common'],
         lowercase=preprocessing_parameters['lowercase'],
         add_padding=False,
         processor=backend.df_engine)
     meta = {'idx2str': index_to_str}
     meta['str2idx'] = str_to_index
     meta['str2freq'] = str_to_freq
     meta['vocab_size'] = len(str_to_index)
     return meta
Esempio n. 12
0
 def get_feature_meta(column, preprocessing_parameters, backend):
     """Compute vocabulary metadata for a set-like column.

     The column is stringified before tokenization. Returns the
     mappings, the vocabulary size, and the largest set size observed.
     """
     column = column.astype(str)
     index_to_str, str_to_index, str_to_freq, largest_set, _, _, _ = \
         create_vocabulary(
             column,
             preprocessing_parameters['tokenizer'],
             num_most_frequent=preprocessing_parameters['most_common'],
             lowercase=preprocessing_parameters['lowercase'],
             processor=backend.df_engine)
     return {
         'idx2str': index_to_str,
         'str2idx': str_to_index,
         'str2freq': str_to_freq,
         'vocab_size': len(str_to_index),
         'max_set_size': largest_set
     }
Esempio n. 13
0
 def get_feature_meta(column, preprocessing_parameters):
     """Compute sequence-feature vocabulary metadata.

     The reported max_sequence_length is the observed maximum capped at
     preprocessing_parameters['sequence_length_limit'].
     """
     index_to_str, str_to_index, str_to_freq, observed_max = create_vocabulary(
         column,
         preprocessing_parameters['format'],
         lowercase=preprocessing_parameters['lowercase'],
         num_most_frequent=preprocessing_parameters['most_common'])
     # Cap the observed max length by the configured limit.
     limit = preprocessing_parameters['sequence_length_limit']
     capped_length = observed_max if observed_max < limit else limit
     return {
         'idx2str': index_to_str,
         'str2idx': str_to_index,
         'str2freq': str_to_freq,
         'vocab_size': len(index_to_str),
         'max_sequence_length': capped_length
     }
Esempio n. 14
0
 def get_feature_meta(column, preprocessing_parameters, backend):
     """Compute vocabulary metadata for a set-like column.

     Returns a dict with the mappings, the vocabulary size, and the
     largest set size observed across rows.
     """
     vocab = create_vocabulary(
         column,
         preprocessing_parameters["tokenizer"],
         num_most_frequent=preprocessing_parameters["most_common"],
         lowercase=preprocessing_parameters["lowercase"],
         processor=backend.df_engine,
     )
     # Only the first four items of the 8-tuple are needed here.
     index_to_str, str_to_index, str_to_freq, largest_set, _, _, _, _ = vocab
     return {
         "idx2str": index_to_str,
         "str2idx": str_to_index,
         "str2freq": str_to_freq,
         "vocab_size": len(str_to_index),
         "max_set_size": largest_set,
     }
Esempio n. 15
0
def test_create_vocabulary_from_hf():
    """A HuggingFace tokenizer should yield the pretrained model's vocab."""
    column = pd.DataFrame([
        "Hello, I'm a single sentence!",
        "And another sentence",
        "And the very very last one",
    ])[0]
    preprocessing_parameters = TextFeatureMixin.preprocessing_defaults()

    vocabulary_output = strings_utils.create_vocabulary(
        column,
        tokenizer_type="hf_tokenizer",
        num_most_frequent=preprocessing_parameters["most_common"],
        lowercase=preprocessing_parameters["lowercase"],
        unknown_symbol=preprocessing_parameters["unknown_symbol"],
        padding_symbol=preprocessing_parameters["padding_symbol"],
        pretrained_model_name_or_path="albert-base-v2",
    )

    # The vocabulary size must match albert-base-v2's 30000 entries.
    idx2str = vocabulary_output[0]
    assert len(idx2str) == 30000
Esempio n. 16
0
 def get_feature_meta(column, preprocessing_parameters):
     """Build sequence vocabulary metadata.

     The reported max_sequence_length is the observed maximum capped at
     preprocessing_parameters['sequence_length_limit'].
     """
     (index_to_str, str_to_index, str_to_freq,
      observed_max, _, _, _) = create_vocabulary(
         column,
         preprocessing_parameters['tokenizer'],
         lowercase=preprocessing_parameters['lowercase'],
         num_most_frequent=preprocessing_parameters['most_common'],
         vocab_file=preprocessing_parameters['vocab_file'],
         unknown_symbol=preprocessing_parameters['unknown_symbol'],
         padding_symbol=preprocessing_parameters['padding_symbol'],
     )
     # Cap the observed max length by the configured limit.
     limit = preprocessing_parameters['sequence_length_limit']
     max_length = observed_max if observed_max < limit else limit
     return {
         'idx2str': index_to_str,
         'str2idx': str_to_index,
         'str2freq': str_to_freq,
         'vocab_size': len(index_to_str),
         'max_sequence_length': max_length
     }
Esempio n. 17
0
 def get_feature_meta(column, preprocessing_parameters, backend):
     """Build sequence vocabulary metadata for a text column.

     The reported max_sequence_length is the capped observed length plus
     two, reserving room for the start and end symbols.
     """
     (index_to_str, str_to_index, str_to_freq,
      observed_max, _, _, _, _) = create_vocabulary(
         column,
         preprocessing_parameters["tokenizer"],
         lowercase=preprocessing_parameters["lowercase"],
         num_most_frequent=preprocessing_parameters["most_common"],
         vocab_file=preprocessing_parameters["vocab_file"],
         unknown_symbol=preprocessing_parameters["unknown_symbol"],
         padding_symbol=preprocessing_parameters["padding_symbol"],
         processor=backend.df_engine,
     )
     # Cap the observed max length by the configured maximum.
     cap = preprocessing_parameters["max_sequence_length"]
     max_length = observed_max if observed_max < cap else cap
     return {
         "idx2str": index_to_str,
         "str2idx": str_to_index,
         "str2freq": str_to_freq,
         "vocab_size": len(index_to_str),
         "max_sequence_length": max_length + 2,  # For start and end symbol.
     }