Example #1
    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
Example #2
    def test_get_transliteration_rules(self):
        self.config_rules()
        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" проспект-Prospekt ") == " prospekt Prospekt "
Example #3
    def test_empty_rule_set(self):
        self.write_config("""\
            normalization:
            transliteration:
            token-analysis:
              - analyzer: generic
                variants:
            """)

        rules = ICURuleLoader(self.project_env)
        assert rules.get_search_rules() == ''
        assert rules.get_normalization_rules() == ''
        assert rules.get_transliteration_rules() == ''
Example #4
def test_get_search_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())

    rules = loader.get_search_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" Baum straße ") == " baum straße "
    assert trans.transliterate(" Baumstraße ") == " baumstraße "
    assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
    assert trans.transliterate(" Baumstr ") == " baumstr "
    assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
    assert trans.transliterate(" Αθήνα ") == " athēna "
    assert trans.transliterate(" проспект ") == " prospekt "
Example #5
def test_empty_rule_set(test_config):
    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(
        dedent("""\
        normalization:
        transliteration:
        token-analysis:
          - analyzer: generic
            variants:
        """))

    rules = ICURuleLoader(test_config)
    assert rules.get_search_rules() == ''
    assert rules.get_normalization_rules() == ''
    assert rules.get_transliteration_rules() == ''
Example #6
    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)
Example #7
    def test_get_search_rules(self):
        self.config_rules()
        loader = ICURuleLoader(self.project_env)

        rules = loader.get_search_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" Baum straße ") == " baum straße "
        assert trans.transliterate(" Baumstraße ") == " baumstraße "
        assert trans.transliterate(" Baumstrasse ") == " baumstrasse "
        assert trans.transliterate(" Baumstr ") == " baumstr "
        assert trans.transliterate(" Baumwegstr ") == " baumwegstr "
        assert trans.transliterate(" Αθήνα ") == " athēna "
        assert trans.transliterate(" проспект ") == " prospekt "
Example #8
def test_missing_section(section, test_config):
    rule_cfg = {s: [] for s in CONFIG_SECTIONS if s != section}
    (test_config.project_dir / 'icu_tokenizer.yaml').write_text(
        yaml.dump(rule_cfg))

    with pytest.raises(UsageError):
        ICURuleLoader(test_config)
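The CONFIG_SECTIONS constant used above is not part of the snippet. Judging from the complete rule files written in the other examples, it presumably lists the three top-level keys of icu_tokenizer.yaml, something like:

    # Assumption: reconstructed from the rule files shown in the other examples,
    # not taken from the snippet itself.
    CONFIG_SECTIONS = ('normalization', 'transliteration', 'token-analysis')

Omitting any one of these sections from the generated YAML is then expected to make ICURuleLoader raise a UsageError.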
Example #9
    def test_search_rules(self):
        self.config_rules('~street => s,st', 'master => mstr')
        proc = ICURuleLoader(self.project_env).make_token_analysis()

        assert proc.search.transliterate('Master Street').strip() == 'master street'
        assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
        assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
Example #10
def test_search_rules(cfgrules):
    config = cfgrules('~street => s,st', 'master => mstr')
    proc = ICURuleLoader(config).make_token_analysis()

    assert proc.search.transliterate(
        'Master Street').strip() == 'master street'
    assert proc.search.transliterate('Earnes St').strip() == 'earnes st'
    assert proc.search.transliterate('Nostreet').strip() == 'nostreet'
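The variant strings handed to cfgrules/config_rules ('~street => s,st', 'master => mstr') presumably end up under the variants key of the generic analyzer in icu_tokenizer.yaml. A sketch of what the fixture likely writes; the nesting below variants (the words entry) is an assumption and is not visible in these snippets:

    normalization:
    transliteration:
    token-analysis:
        - analyzer: generic
          variants:
              - words:
                  - ~street => s,st
                  - master => mstr

make_token_analysis() then builds the combined search transliterator from these rules, which the assertions above exercise.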
Example #11
    def test_transliteration_rules_from_file(self):
        self.write_config("""\
            normalization:
            transliteration:
                - "'ax' > 'b'"
                - !include transliteration.yaml
            token-analysis:
                - analyzer: generic
                  variants:
            """)
        transpath = self.project_env.project_dir / ('transliteration.yaml')
        transpath.write_text('- "x > y"')

        loader = ICURuleLoader(self.project_env)
        rules = loader.get_transliteration_rules()
        trans = Transliterator.createFromRules("test", rules)

        assert trans.transliterate(" axxt ") == " byt "
Example #12
def test_transliteration_rules_from_file(test_config):
    cfgpath = test_config.project_dir / ('icu_tokenizer.yaml')
    cfgpath.write_text(
        dedent("""\
        normalization:
        transliteration:
            - "'ax' > 'b'"
            - !include transliteration.yaml
        token-analysis:
            - analyzer: generic
              variants:
        """))
    transpath = test_config.project_dir / ('transliteration.yaml')
    transpath.write_text('- "x > y"')

    loader = ICURuleLoader(test_config)
    rules = loader.get_transliteration_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" axxt ") == " byt "
Example #13
def test_get_normalization_rules(cfgrules):
    loader = ICURuleLoader(cfgrules())
    rules = loader.get_normalization_rules()
    trans = Transliterator.createFromRules("test", rules)

    assert trans.transliterate(" проспект-Prospekt ") == " проспект prospekt "
Example #14
    def test_invalid_variant_description(self, variant):
        with pytest.raises(UsageError):
            ICURuleLoader(self.cfgrules(variant))
Example #15
    def get_replacements(self, *variants):
        loader = ICURuleLoader(self.cfgrules(*variants))
        rules = loader.analysis[None].config['replacements']

        return sorted((k, sorted(v)) for k, v in rules)
Example #16
class LegacyICUTokenizer(AbstractTokenizer):
    """ This tokenizer uses libICU to covert names and queries to ASCII.
        Otherwise it uses the same algorithms and data structures as the
        normalization routines in Nominatim 3.
    """
    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.loader = None

    def init_new_db(self, config, init_db=True):
        """ Set up a new tokenizer for the database.

            This copies all necessary data into the project directory to make
            sure the tokenizer remains stable even over updates.
        """
        self.loader = ICURuleLoader(config)

        self._install_php(config.lib_dir.php)
        self._save_config()

        if init_db:
            self.update_sql_functions(config)
            self._init_db_tables(config)

    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)

    def finalize_import(self, config):
        """ Do any required postprocessing to make the tokenizer data ready
            for use.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')

    def update_sql_functions(self, config):
        """ Reimport the SQL functions for this tokenizer.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer.sql')

    def check_database(self, config):
        """ Check that the tokenizer is set up correctly.
        """
        # Will throw an error if there is an issue.
        self.init_from_project(config)

    def update_statistics(self):
        """ Recompute frequencies for all name words.
        """
        with connect(self.dsn) as conn:
            if conn.table_exists('search_name'):
                with conn.cursor() as cur:
                    cur.drop_table("word_frequencies")
                    LOG.info("Computing word frequencies")
                    cur.execute("""CREATE TEMP TABLE word_frequencies AS
                                     SELECT unnest(name_vector) as id, count(*)
                                     FROM search_name GROUP BY id""")
                    cur.execute("CREATE INDEX ON word_frequencies(id)")
                    LOG.info("Update word table with recomputed frequencies")
                    cur.execute("""UPDATE word
                                   SET info = info || jsonb_build_object('count', count)
                                   FROM word_frequencies WHERE word_id = id""")
                    cur.drop_table("word_frequencies")
            conn.commit()

    def name_analyzer(self):
        """ Create a new analyzer for tokenizing names and queries
            using this tokenizer. Analyzers are context managers and should
            be used accordingly:

            ```
            with tokenizer.name_analyzer() as analyzer:
                analyzer.tokenize()
            ```

            When used outside the with construct, the caller must ensure that
            the close() function is called before the analyzer is destroyed.

            Analyzers are not thread-safe. You need to instantiate one per thread.
        """
        return LegacyICUNameAnalyzer(self.dsn, self.loader.make_sanitizer(),
                                     self.loader.make_token_analysis())

    def _install_php(self, phpdir):
        """ Install the php script for the tokenizer.
        """
        php_file = self.data_dir / "tokenizer.php"
        php_file.write_text(
            dedent(f"""\
            <?php
            @define('CONST_Max_Word_Frequency', 10000000);
            @define('CONST_Term_Normalization_Rules', "{self.loader.normalization_rules}");
            @define('CONST_Transliteration', "{self.loader.get_search_rules()}");
            require_once('{phpdir}/tokenizer/icu_tokenizer.php');"""))

    def _save_config(self):
        """ Save the configuration that needs to remain stable for the given
            database as database properties.
        """
        with connect(self.dsn) as conn:
            self.loader.save_config_to_db(conn)

    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
            conn.commit()
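Taken together, the methods above suggest a two-phase lifecycle: a fresh import calls init_new_db() (load the ICU rules, install the PHP shim, create and fill the word tables), while later runs against an existing database call init_from_project() to restore the rules saved as database properties. A minimal usage sketch, assuming dsn, data_dir and config come from the surrounding Nominatim setup code:

    # Sketch only: dsn, data_dir and config are placeholders supplied by the caller.
    tokenizer = LegacyICUTokenizer(dsn, data_dir)

    # First import: read the rules and set up the database.
    tokenizer.init_new_db(config)

    # Runs against an already initialised database would instead use:
    # tokenizer.init_from_project(config)

    # Analyzers are context managers and not thread-safe; use one per thread.
    with tokenizer.name_analyzer() as analyzer:
        ...  # analyze names and queries here, as in the name_analyzer() docstring

Whether init_new_db() or init_from_project() applies depends on whether the database already carries a saved tokenizer configuration.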
Example #17
    def test_missing_section(self, section):
        rule_cfg = {s: [] for s in CONFIG_SECTIONS if s != section}
        self.write_config(yaml.dump(rule_cfg))

        with pytest.raises(UsageError):
            ICURuleLoader(self.project_env)
Example #18
    def test_invalid_variant_description(self, variant):
        self.config_rules(variant)
        with pytest.raises(UsageError):
            ICURuleLoader(self.project_env)