Esempio n. 1
0
  def test_all_replacements(self):
    input_spec = {'text': lit_types.TextSegment()}
    model = testing_utils.TestRegressionModel(input_spec)
    # Dataset is only used for spec in word_replacer so define once
    dataset = lit_dataset.Dataset(input_spec, [{'text': 'blank'}])

    ## Test replacements
    generator = word_replacer.WordReplacer()
    # Unicode to Unicode
    input_dict = {'text': '♞ is a black chess knight.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: '♞ -> ♟',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': '♟ is a black chess knight.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Unicode to ASCII
    input_dict = {'text': 'Is répertoire a unicode word?'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'répertoire -> repertoire',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'Is repertoire a unicode word?'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Ignore capitalization
    input_dict = {'text': 'Capitalization is ignored.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'Capitalization -> blank',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'blank is ignored.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    input_dict = {'text': 'Capitalization is ignored.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'capitalization -> blank',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'blank is ignored.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Do not Ignore capitalization
    input_dict = {'text': 'Capitalization is important.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'Capitalization -> blank',
        word_replacer.IGNORE_CASING_KEY: False,
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'blank is important.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    input_dict = {'text': 'Capitalization is important.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'capitalization -> blank',
        word_replacer.IGNORE_CASING_KEY: False,
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = []
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Repetition
    input_dict = {'text': 'maybe repetition repetition maybe'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'repetition -> blank',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'maybe blank repetition maybe'},
                {'text': 'maybe repetition blank maybe'}]
    self.assertCountEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # No partial match
    input_dict = {'text': 'A catastrophic storm'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'cat -> blank',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = []
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    ## Special characters
    # Punctuation
    input_dict = {'text': 'A catastrophic storm .'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: '. -> -',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'A catastrophic storm -'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    input_dict = {'text': 'A.catastrophic. storm'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: '. -> -',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'A-catastrophic. storm'},
                {'text': 'A.catastrophic- storm'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    input_dict = {'text': 'A...catastrophic.... storm'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: '.. -> --',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'A--.catastrophic.... storm'},
                {'text': 'A...catastrophic--.. storm'},
                {'text': 'A...catastrophic..-- storm'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Underscore
    input_dict = {'text': 'A catastrophic_storm is raging.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'catastrophic_storm -> nice_storm',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'A nice_storm is raging.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Deletion
    input_dict = {'text': 'A storm is raging.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'storm -> ',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'A  is raging.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Word next to punctuation and words with punctuation.
    input_dict = {'text': 'It`s raining cats and dogs.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'dogs -> blank',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'It`s raining cats and blank.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Multiple target tokens.
    input_dict = {'text': 'It`s raining cats and dogs.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'dogs -> horses|donkeys',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'It`s raining cats and horses.'},
                {'text': 'It`s raining cats and donkeys.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    ## Test default_replacements applied at init.
    replacements = {'tree': ['car']}
    generator = word_replacer.WordReplacer(replacements=replacements)
    input_dict = {'text': 'black truck hit the tree'}
    expected = [{'text': 'black truck hit the car'}]
    config_dict = {
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }

    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    ## Test not passing replacements not breaking.
    generator = word_replacer.WordReplacer()
    input_dict = {'text': 'xyz yzy zzz.'}
    expected = []

    self.assertEqual(
        generator.generate(input_dict, model, dataset), expected)

    # Multi word match.
    input_dict = {'text': 'A red cat is coming.'}
    config_dict = {
        word_replacer.SUBSTITUTIONS_KEY: 'red cat -> black dog',
        word_replacer.FIELDS_TO_REPLACE_KEY: ['text'],
    }
    expected = [{'text': 'A black dog is coming.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)
Esempio n. 2
0
  def test_all_replacements(self):
    input_spec = {'text': lit_types.TextSegment()}
    model = testing_utils.TestRegressionModel(input_spec)
    # Dataset is only used for spec in word_replacer so define once
    dataset = lit_dataset.Dataset(input_spec, {'text': 'blank'})

    ## Test replacements
    generator = word_replacer.WordReplacer()
    # Unicode to Unicode
    input_dict = {'text': '♞ is a black chess knight.'}
    config_dict = {'subs': '♞ -> ♟'}
    expected = [{'text': '♟ is a black chess knight.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Unicode to ASCII
    input_dict = {'text': 'Is répertoire a unicode word?'}
    config_dict = {'subs': 'répertoire -> repertoire'}
    expected = [{'text': 'Is repertoire a unicode word?'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Capitalization
    input_dict = {'text': 'Capitalization is important.'}
    config_dict = {'subs': 'Capitalization -> blank'}
    expected = [{'text': 'blank is important.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    input_dict = {'text': 'Capitalization is important.'}
    config_dict = {'subs': 'capitalization -> blank'}
    expected = []
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Repetition
    input_dict = {'text': 'maybe repetition repetition maybe'}
    config_dict = {'subs': 'repetition -> blank'}
    expected = [{'text': 'maybe blank repetition maybe'},
                {'text': 'maybe repetition blank maybe'}]
    self.assertCountEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # No partial match
    input_dict = {'text': 'A catastrophic storm'}
    config_dict = {'subs': 'cat -> blank'}
    expected = []
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    ## Special characters
    # Punctuation
    input_dict = {'text': 'A catastrophic storm .'}
    config_dict = {'subs': '. -> -'}
    expected = [{'text': 'A catastrophic storm -'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Underscore
    input_dict = {'text': 'A catastrophic_storm is raging.'}
    config_dict = {'subs': 'catastrophic_storm -> nice_storm'}
    expected = [{'text': 'A nice_storm is raging.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    # Word next to punctuation and words with punctuation.
    input_dict = {'text': 'It`s raining cats and dogs.'}
    config_dict = {'subs': 'dogs -> blank'}
    expected = [{'text': 'It`s raining cats and blank.'}]
    self.assertEqual(
        generator.generate(input_dict, model, dataset, config=config_dict),
        expected)

    ## Test default_replacements applied at init.
    replacements = {'tree': 'car'}
    generator = word_replacer.WordReplacer(replacements=replacements)
    input_dict = {'text': 'black truck hit the tree'}
    expected = [{'text': 'black truck hit the car'}]

    self.assertEqual(
        generator.generate(input_dict, model, dataset), expected)

    ## Test not passing replacements not breaking.
    generator = word_replacer.WordReplacer()
    input_dict = {'text': 'xyz yzy zzz.'}
    expected = []

    self.assertEqual(
        generator.generate(input_dict, model, dataset), expected)