def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
    factory.create_tokenizer(test_config)

    test_config.project_dir = test_config.project_dir / 'foo'

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(test_config)
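The fixtures used by these tests (tokenizer_mock, test_config, property_table) are not part of this excerpt. Below is a minimal sketch of what the tokenizer_mock fixture and the DummyTokenizer it provides could look like; the patched hook name and the method signatures are assumptions, only the init_state values are taken from the assertions on this page.

from types import SimpleNamespace

import pytest


class DummyTokenizer:
    """Stand-in tokenizer that only records how it was initialised."""

    def __init__(self, dsn, data_dir):
        self.dsn = dsn
        self.data_dir = data_dir
        self.init_state = None

    def init_new_db(self, config):
        # Called when factory.create_tokenizer() sets up a new import.
        self.init_state = "new"

    def init_from_project(self):
        # Called when factory.get_tokenizer_for_db() loads an existing project.
        self.init_state = "loaded"


@pytest.fixture
def tokenizer_mock(monkeypatch):
    """Have the factory hand out DummyTokenizer instead of a real module."""
    # Assumption: the factory resolves the tokenizer module through an internal
    # helper; patching it keeps the tests independent of any real tokenizer.
    dummy_module = SimpleNamespace(
        create=lambda dsn, data_dir: DummyTokenizer(dsn, data_dir))
    monkeypatch.setattr('nominatim.tokenizer.factory._import_tokenizer',
                        lambda name: dummy_module)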
Example No. 2
    def test_load_missing_property(self, temp_db_cursor):
        factory.create_tokenizer(self.config)

        temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")

        with pytest.raises(UsageError):
            factory.get_tokenizer_for_db(self.config)
Example No. 3
    def test_load_no_tokenizer_dir(self):
        factory.create_tokenizer(self.config)

        self.config.project_dir = self.config.project_dir / 'foo'

        with pytest.raises(UsageError):
            factory.get_tokenizer_for_db(self.config)
def test_load_missing_property(temp_db_cursor, test_config, tokenizer_mock,
                               property_table):
    factory.create_tokenizer(test_config)

    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(test_config)
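The property_table fixture that these tests truncate is likewise not shown. A plausible minimal version just creates the nominatim_properties key/value table in the temporary test database; the exact column definitions are an assumption.

import pytest


@pytest.fixture
def property_table(temp_db_cursor):
    # Assumed layout: simple key/value pairs as read by properties.get_property().
    temp_db_cursor.execute("""CREATE TABLE nominatim_properties (
                                  property TEXT,
                                  value TEXT)""")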
Example No. 5
def check_word_table_for_postcodes(context, exclude, postcodes):
    """ Check that the tokenizer produces postcode tokens for the given
        postcodes. The postcodes are a comma-separated list of postcodes.
        Whitespace matters.
    """
    nctx = context.nominatim
    tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())
    with tokenizer.name_analyzer() as ana:
        plist = [ana.normalize_postcode(p) for p in postcodes.split(',')]

    plist.sort()

    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        if nctx.tokenizer == 'icu':
            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                        (plist,))
        else:
            cur.execute("""SELECT word FROM word WHERE word = any(%s)
                             and class = 'place' and type = 'postcode'""",
                        (plist,))

        found = [row[0] for row in cur]
        assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"

    if exclude:
        assert len(found) == 0, f"Unexpected postcodes: {found}"
    else:
        assert set(found) == set(plist), \
               f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
Example No. 6
def test_load_tokenizer(test_config):
    factory.create_tokenizer(test_config)

    tokenizer = factory.get_tokenizer_for_db(test_config)

    assert isinstance(tokenizer, DummyTokenizer)
    assert tokenizer.init_state == "loaded"
Example No. 7
    def run(args):
        from nominatim.tokenizer import factory as tokenizer_factory
        from nominatim.tools import tiger_data, add_osm_data

        if args.tiger_data:
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            return tiger_data.add_tiger_data(args.tiger_data, args.config,
                                             args.threads or psutil.cpu_count() or 1,
                                             tokenizer)

        osm2pgsql_params = args.osm2pgsql_options(default_cache=1000,
                                                  default_threads=1)
        if args.file or args.diff:
            return add_osm_data.add_data_from_file(args.file or args.diff,
                                                   osm2pgsql_params)

        if args.node:
            return add_osm_data.add_osm_object('node', args.node,
                                               args.use_main_api,
                                               osm2pgsql_params)

        if args.way:
            return add_osm_data.add_osm_object('way', args.way,
                                               args.use_main_api,
                                               osm2pgsql_params)

        if args.relation:
            return add_osm_data.add_osm_object('relation', args.relation,
                                               args.use_main_api,
                                               osm2pgsql_params)

        return 0
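This run() handler is one half of a CLI subcommand; the matching argument declarations are not included in the excerpt. Below is a rough argparse sketch of the options the method reads (--tiger-data, --file, --diff, --node, --way, --relation, --use-main-api, --threads); the grouping and help texts are assumptions.

def add_args(parser):
    group = parser.add_argument_group('Data sources')
    group.add_argument('--tiger-data', metavar='DIR',
                       help='Add housenumbers from a TIGER data directory')
    group.add_argument('--file', metavar='FILE',
                       help='Import OSM data from a file')
    group.add_argument('--diff', metavar='FILE',
                       help='Import OSM changes from a diff file')
    group.add_argument('--node', metavar='ID', type=int,
                       help='Import a single OSM node')
    group.add_argument('--way', metavar='ID', type=int,
                       help='Import a single OSM way')
    group.add_argument('--relation', metavar='ID', type=int,
                       help='Import a single OSM relation')
    group.add_argument('--use-main-api', action='store_true',
                       help='Fetch objects from the main API instead of Overpass')
    parser.add_argument('--threads', metavar='NUM', type=int,
                        help='Number of threads to use')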
def test_load_tokenizer(temp_db_conn, test_config, tokenizer_mock,
                        property_table):
    factory.create_tokenizer(test_config)

    tokenizer = factory.get_tokenizer_for_db(test_config)

    assert isinstance(tokenizer, DummyTokenizer)
    assert tokenizer.init_state == "loaded"
Example No. 9
def check_tokenizer(_, config):
    """ Checking that tokenizer works
    """
    try:
        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
    except UsageError:
        return CheckState.FAIL, dict(msg="""\
            Cannot load tokenizer. Did the import finish successfully?""")

    result = tokenizer.check_database()

    if result is None:
        return CheckState.OK

    return CheckState.FAIL, dict(msg=result)
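Note that check_tokenizer() returns either a bare CheckState.OK or a (CheckState.FAIL, details) tuple, so a caller has to handle both shapes. A hypothetical helper that normalises the result:

def report_tokenizer_check(config):
    # Hypothetical caller, not part of Nominatim: collapse the two possible
    # return shapes of check_tokenizer() into a (state, message) pair.
    result = check_tokenizer(None, config)
    if isinstance(result, tuple):
        state, details = result
        return state, details.get('msg', '')
    return result, ''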
Example No. 10
def migrate(config, paths):
    """ Check for the current database version and execute migrations,
        if necessary.
    """
    with connect(config.get_libpq_dsn()) as conn:
        if conn.table_exists('nominatim_properties'):
            db_version_str = properties.get_property(conn, 'database_version')
        else:
            db_version_str = None

        if db_version_str is not None:
            parts = db_version_str.split('.')
            db_version = tuple(
                [int(x) for x in parts[:2] + parts[2].split('-')])

            if db_version == NOMINATIM_VERSION:
                LOG.warning("Database already at latest version (%s)",
                            db_version_str)
                return 0

            LOG.info("Detected database version: %s", db_version_str)
        else:
            db_version = _guess_version(conn)

        has_run_migration = False
        for version, func in _MIGRATION_FUNCTIONS:
            if db_version <= version:
                LOG.warning("Runnning: %s (%s)",
                            func.__doc__.split('\n', 1)[0],
                            '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
                kwargs = dict(conn=conn, config=config, paths=paths)
                func(**kwargs)
                conn.commit()
                has_run_migration = True

        if has_run_migration:
            LOG.warning('Updating SQL functions.')
            refresh.create_functions(conn, config)
            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
            tokenizer.update_sql_functions(config)

        properties.set_property(
            conn, 'database_version',
            '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))

        conn.commit()

    return 0
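The migration loop relies on comparing version tuples, so a database version string such as '3.6.0-1' is first split into (3, 6, 0, 1) by the parts[:2] + parts[2].split('-') expression. A standalone illustration of that parsing and the comparison it enables:

def parse_db_version(version_str):
    # '3.6.0-1' -> (3, 6, 0, 1); mirrors the parsing done in migrate() above.
    parts = version_str.split('.')
    return tuple(int(x) for x in parts[:2] + parts[2].split('-'))


assert parse_db_version('3.6.0-1') == (3, 6, 0, 1)
assert parse_db_version('3.6.0-1') < parse_db_version('3.7.0-0')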
Example No. 11
def check_search_name_contents(context, exclude):
    """ Check contents of place/placex tables. Each row represents a table row
        and all data must match. Data not present in the expected table, may
        be arbitry. The rows are identified via the 'object' column which must
        have an identifier of the form '<NRW><osm id>[:<class>]'. All
        expected rows are expected to be present with at least one database row.
    """
    tokenizer = tokenizer_factory.get_tokenizer_for_db(
        context.nominatim.get_test_config())

    with tokenizer.name_analyzer() as analyzer:
        with context.db.cursor(
                cursor_factory=psycopg2.extras.DictCursor) as cur:
            for row in context.table:
                nid = NominatimID(row['object'])
                nid.row_by_place_id(
                    cur, 'search_name',
                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
                assert cur.rowcount > 0, "No rows found for " + row['object']

                for res in cur:
                    db_row = DBRow(nid, res, context)
                    for name, value in zip(row.headings, row.cells):
                        if name in ('name_vector', 'nameaddress_vector'):
                            items = [x.strip() for x in value.split(',')]
                            tokens = analyzer.get_word_token_info(
                                context.db, items)

                            if not exclude:
                                assert len(tokens) >= len(items), \
                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
                            for word, token, wid in tokens:
                                if exclude:
                                    assert wid not in res[name], \
                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                else:
                                    assert wid in res[name], \
                                           "Missing term for {}/{}: {}".format(nid, name, wid)
                        elif name != 'object':
                            assert db_row.contains(name, value), \
                                   db_row.assert_msg(name, value)
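The docstring above describes object identifiers of the form '<NRW><osm id>[:<class>]', which the NominatimID class parses. A small, hypothetical parser makes that convention concrete; the real class does more (for example, the database lookups used above).

import re


def parse_object_id(oid):
    # 'N1234:amenity' -> ('N', 1234, 'amenity'); 'W99' -> ('W', 99, None)
    match = re.fullmatch(r'([NRW])(\d+)(?::(\w+))?', oid)
    if match is None:
        raise ValueError(f"Not a valid object identifier: {oid}")
    osm_type, osm_id, osm_class = match.groups()
    return osm_type, int(osm_id), osm_class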