Example #1
0
    def test_search_rules(self):
        """ The search transliterator must fold case and apply the
            configured abbreviation variants.
        """
        self.config_rules('~street => s,st', 'master => mstr')
        proc = ICURuleLoader(self.project_env).make_token_analysis()

        for term, expected in (('Master Street', 'master street'),
                               ('Earnes St', 'earnes st'),
                               ('Nostreet', 'nostreet')):
            assert proc.search.transliterate(term).strip() == expected
Example #2
0
    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.

            The ICU rules are not re-read from the configuration files;
            instead the previously stored rules are loaded back from the
            database, so the tokenizer state matches the existing import.

            Arguments:
              config: Project configuration handed to the rule loader.
        """
        self.loader = ICURuleLoader(config)

        # Restore the frozen rule set from the project database.
        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
Example #3
0
    def test_get_transliteration_rules(self):
        """ Transliteration must romanise Cyrillic, split on the hyphen
            and leave Latin text (including case) untouched.
        """
        self.config_rules()
        rules = ICURuleLoader(self.project_env).get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
Example #4
0
def test_missing_section(section, test_config):
    """ A configuration that lacks one of the mandatory sections
        must be rejected with a usage error.
    """
    incomplete = {name: [] for name in CONFIG_SECTIONS if name != section}
    cfgfile = test_config.project_dir / 'icu_tokenizer.yaml'
    cfgfile.write_text(yaml.dump(incomplete))

    with pytest.raises(UsageError):
        ICURuleLoader(test_config)
Example #5
0
def test_search_rules(cfgrules):
    """ The search transliterator must fold case and apply the
        configured abbreviation variants.
    """
    config = cfgrules('~street => s,st', 'master => mstr')
    proc = ICURuleLoader(config).make_token_analysis()

    for term, expected in (('Master Street', 'master street'),
                           ('Earnes St', 'earnes st'),
                           ('Nostreet', 'nostreet')):
        assert proc.search.transliterate(term).strip() == expected
Example #6
0
def test_get_search_rules(cfgrules):
    """ The default search rules must lower-case, keep umlauts and ß
        unchanged and transliterate Greek and Cyrillic script.
    """
    trans = Transliterator.createFromRules(
        "test", ICURuleLoader(cfgrules()).get_search_rules())

    cases = [(" Baum straße ", " baum straße "),
             (" Baumstraße ", " baumstraße "),
             (" Baumstrasse ", " baumstrasse "),
             (" Baumstr ", " baumstr "),
             (" Baumwegstr ", " baumwegstr "),
             (" Αθήνα ", " athēna "),
             (" проспект ", " prospekt ")]

    for term, expected in cases:
        assert trans.transliterate(term) == expected
Example #7
0
    def test_empty_rule_set(self):
        """ Sections may be present but empty; all rule getters then
            return empty strings.
        """
        self.write_config("""\
            normalization:
            transliteration:
            token-analysis:
              - analyzer: generic
                variants:
            """)

        loader = ICURuleLoader(self.project_env)

        for getter in (loader.get_search_rules,
                       loader.get_normalization_rules,
                       loader.get_transliteration_rules):
            assert getter() == ''
Example #8
0
def test_empty_rule_set(test_config):
    """ Sections may be present but empty; all rule getters then
        return empty strings.
    """
    cfgfile = test_config.project_dir / 'icu_tokenizer.yaml'
    cfgfile.write_text(dedent("""\
        normalization:
        transliteration:
        token-analysis:
          - analyzer: generic
            variants:
        """))

    loader = ICURuleLoader(test_config)

    for getter in (loader.get_search_rules,
                   loader.get_normalization_rules,
                   loader.get_transliteration_rules):
        assert getter() == ''
Example #9
0
    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data in the project directory to make
            sure the tokenizer remains stable even over updates.

            Arguments:
              config: Project configuration used to load the ICU rules.
              init_db: When True (default), also install the SQL functions
                       and create the tokenizer's database tables.
        """
        self.loader = ICURuleLoader(config)

        # Copy the PHP support code and persist the loaded rules into the
        # project directory before touching the database.
        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
Example #10
0
    def test_get_search_rules(self):
        """ The default search rules must lower-case, keep umlauts and ß
            unchanged and transliterate Greek and Cyrillic script.
        """
        self.config_rules()
        trans = Transliterator.createFromRules(
            "test", ICURuleLoader(self.project_env).get_search_rules())

        cases = [(" Baum straße ", " baum straße "),
                 (" Baumstraße ", " baumstraße "),
                 (" Baumstrasse ", " baumstrasse "),
                 (" Baumstr ", " baumstr "),
                 (" Baumwegstr ", " baumwegstr "),
                 (" Αθήνα ", " athēna "),
                 (" проспект ", " prospekt ")]

        for term, expected in cases:
            assert trans.transliterate(term) == expected
Example #11
0
    def test_transliteration_rules_from_file(self):
        """ Additional transliteration rules can be pulled in from a
            separate file with the '!include' directive.
        """
        self.write_config("""\
            normalization:
            transliteration:
                - "'ax' > 'b'"
                - !include transliteration.yaml
            token-analysis:
                - analyzer: generic
                  variants:
            """)
        include_file = self.project_env.project_dir / 'transliteration.yaml'
        include_file.write_text('- "x > y"')

        rules = ICURuleLoader(self.project_env).get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" axxt ") == " byt "
Example #12
0
def test_transliteration_rules_from_file(test_config):
    """ Additional transliteration rules can be pulled in from a
        separate file with the '!include' directive.
    """
    cfgpath = test_config.project_dir / 'icu_tokenizer.yaml'
    cfgpath.write_text(dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """))
    include_file = test_config.project_dir / 'transliteration.yaml'
    include_file.write_text('- "x > y"')

    rules = ICURuleLoader(test_config).get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
Example #13
0
def test_get_normalization_rules(cfgrules):
    """ Normalization must lower-case Latin script, replace the hyphen
        with a space and leave Cyrillic untouched.
    """
    norm_rules = ICURuleLoader(cfgrules()).get_normalization_rules()
    trans = Transliterator.createFromRules("test", norm_rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
Example #14
0
 def test_invalid_variant_description(self, variant):
     # A configuration containing a malformed variant rule must be
     # rejected with a usage error when the loader parses it.
     with pytest.raises(UsageError):
         ICURuleLoader(self.cfgrules(variant))
Example #15
0
    def get_replacements(self, *variants):
        """ Load the given variant rules and return the computed
            replacement table as a sorted list of (word, variants) pairs
            with the variants of each word sorted as well.
        """
        loader = ICURuleLoader(self.cfgrules(*variants))
        table = loader.analysis[None].config['replacements']

        return sorted([(word, sorted(repl)) for word, repl in table])
Example #16
0
    def test_missing_section(self, section):
        """ A configuration that lacks one of the mandatory sections
            must be rejected with a usage error.
        """
        incomplete = {name: [] for name in CONFIG_SECTIONS if name != section}
        self.write_config(yaml.dump(incomplete))

        with pytest.raises(UsageError):
            ICURuleLoader(self.project_env)
Example #17
0
 def test_invalid_variant_description(self, variant):
     # Write a configuration with a malformed variant rule; loading it
     # must then fail with a usage error.
     self.config_rules(variant)
     with pytest.raises(UsageError):
         ICURuleLoader(self.project_env)