def test_load_no_tokenizer_dir(test_config, tokenizer_mock, property_table):
    """ Loading must fail with a UsageError when the project directory
        (and thus the tokenizer directory) does not exist.
    """
    factory.create_tokenizer(test_config)

    # Point the project directory at a path that was never created.
    missing_dir = test_config.project_dir / 'foo'
    test_config.project_dir = missing_dir

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(test_config)
def test_load_missing_property(self, temp_db_cursor):
    """ Loading must fail with a UsageError when the tokenizer properties
        have been wiped from the database.
    """
    factory.create_tokenizer(self.config)

    # Remove every stored property, including the tokenizer name.
    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(self.config)
def test_load_no_tokenizer_dir(self):
    """ Loading must fail with a UsageError when the project directory
        (and thus the tokenizer directory) does not exist.
    """
    factory.create_tokenizer(self.config)

    # Redirect the project directory to a path that was never created.
    missing_dir = self.config.project_dir / 'foo'
    self.config.project_dir = missing_dir

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(self.config)
def test_load_missing_property(temp_db_cursor, test_config, tokenizer_mock, property_table):
    """ Loading must fail with a UsageError when the tokenizer properties
        have been wiped from the database.

        (Fixes the misspelled test name 'test_load_missing_propoerty';
        pytest discovery is unaffected and no other code refers to a
        test function by name.)
    """
    factory.create_tokenizer(test_config)

    # Remove every stored property, including the tokenizer name.
    temp_db_cursor.execute("TRUNCATE TABLE nominatim_properties")

    with pytest.raises(UsageError):
        factory.get_tokenizer_for_db(test_config)
def check_word_table_for_postcodes(context, exclude, postcodes):
    """ Check that the tokenizer produces postcode tokens for the given
        postcodes. The postcodes are a comma-separated list of postcodes.
        Whitespace matters.
    """
    nctx = context.nominatim
    tokenizer = tokenizer_factory.get_tokenizer_for_db(nctx.get_test_config())

    # Normalize the expected postcodes the same way the tokenizer does.
    with tokenizer.name_analyzer() as ana:
        plist = sorted(ana.normalize_postcode(pc) for pc in postcodes.split(','))

    with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
        if nctx.tokenizer == 'icu':
            cur.execute("SELECT word FROM word WHERE type = 'P' and word = any(%s)",
                        (plist,))
        else:
            cur.execute("""SELECT word FROM word WHERE word = any(%s) and class = 'place' and type = 'postcode'""",
                        (plist,))

        found = [row[0] for row in cur]

    assert len(found) == len(set(found)), f"Duplicate rows for postcodes: {found}"

    if exclude:
        assert len(found) == 0, f"Unexpected postcodes: {found}"
    else:
        assert set(found) == set(plist), \
               f"Missing postcodes {set(plist) - set(found)}. Found: {found}"
def test_load_tokenizer(test_config):
    """ A previously created tokenizer must load again and report that it
        was initialized from an existing setup.
    """
    factory.create_tokenizer(test_config)

    loaded = factory.get_tokenizer_for_db(test_config)

    assert isinstance(loaded, DummyTokenizer)
    assert loaded.init_state == "loaded"
def run(args):
    """ Execute the add-data subcommand: import TIGER data, an OSM file/diff,
        or a single OSM object selected on the command line.
    """
    # Lazy imports keep CLI start-up fast.
    from nominatim.tokenizer import factory as tokenizer_factory
    from nominatim.tools import tiger_data, add_osm_data

    if args.tiger_data:
        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
        return tiger_data.add_tiger_data(args.tiger_data,
                                         args.config,
                                         args.threads or psutil.cpu_count() or 1,
                                         tokenizer)

    osm2pgsql_params = args.osm2pgsql_options(default_cache=1000, default_threads=1)

    if args.file or args.diff:
        return add_osm_data.add_data_from_file(args.file or args.diff,
                                               osm2pgsql_params)

    # Single-object import: exactly one of node/way/relation may be set.
    for osm_type in ('node', 'way', 'relation'):
        osm_id = getattr(args, osm_type)
        if osm_id:
            return add_osm_data.add_osm_object(osm_type, osm_id,
                                               args.use_main_api,
                                               osm2pgsql_params)

    return 0
def test_load_tokenizer(temp_db_conn, test_config, tokenizer_mock, property_table):
    """ A previously created tokenizer must load again and report that it
        was initialized from an existing setup.
    """
    factory.create_tokenizer(test_config)

    loaded = factory.get_tokenizer_for_db(test_config)

    assert isinstance(loaded, DummyTokenizer)
    assert loaded.init_state == "loaded"
def check_tokenizer(_, config):
    """ Checking that tokenizer works

        Loads the tokenizer configured for the database and delegates the
        detailed check to the tokenizer's own check_database() function.

        Returns CheckState.OK on success or (CheckState.FAIL, msg-dict)
        when the tokenizer cannot be loaded or reports a problem.
    """
    try:
        tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
    except UsageError:
        # Fixed typo in the user-facing message: 'sucessfully' -> 'successfully'.
        return CheckState.FAIL, dict(msg="""\
Cannot load tokenizer. Did the import finish successfully?""")

    result = tokenizer.check_database()

    # check_database() returns None when everything is fine, otherwise
    # a human-readable description of the problem.
    if result is None:
        return CheckState.OK

    return CheckState.FAIL, dict(msg=result)
def migrate(config, paths):
    """ Check for the current database version and execute migrations,
        if necessary.

        Reads the stored 'database_version' property (or guesses the version
        for old databases without a property table), runs every migration
        registered for that version or later, refreshes the SQL functions if
        anything was migrated, and finally stamps the database with the
        current NOMINATIM_VERSION.

        Returns 0 on success. (Only code change vs. the original: fixed the
        'Runnning' typo in the log message.)
    """
    with connect(config.get_libpq_dsn()) as conn:
        if conn.table_exists('nominatim_properties'):
            db_version_str = properties.get_property(conn, 'database_version')
        else:
            # Pre-property-table database; version must be guessed below.
            db_version_str = None

        if db_version_str is not None:
            # Version string has the form 'major.minor.patch-db'; convert it
            # into a 4-int tuple comparable with NOMINATIM_VERSION.
            parts = db_version_str.split('.')
            db_version = tuple([int(x) for x in parts[:2] + parts[2].split('-')])

            if db_version == NOMINATIM_VERSION:
                LOG.warning("Database already at latest version (%s)", db_version_str)
                return 0

            LOG.info("Detected database version: %s", db_version_str)
        else:
            db_version = _guess_version(conn)

        has_run_migration = False
        # _MIGRATION_FUNCTIONS is ordered; run every migration introduced at
        # or after the detected database version.
        for version, func in _MIGRATION_FUNCTIONS:
            if db_version <= version:
                # First docstring line of the migration function doubles as
                # its user-visible description.
                LOG.warning("Running: %s (%s)",
                            func.__doc__.split('\n', 1)[0],
                            '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(version))
                kwargs = dict(conn=conn, config=config, paths=paths)
                func(**kwargs)
                # Commit after each step so a failing migration does not roll
                # back the ones already applied.
                conn.commit()
                has_run_migration = True

        if has_run_migration:
            LOG.warning('Updating SQL functions.')
            refresh.create_functions(conn, config)
            tokenizer = tokenizer_factory.get_tokenizer_for_db(config)
            tokenizer.update_sql_functions(config)

        # Stamp the database with the current version, even if no migration ran.
        properties.set_property(conn, 'database_version',
                                '{0[0]}.{0[1]}.{0[2]}-{0[3]}'.format(NOMINATIM_VERSION))

        conn.commit()

    return 0
def check_search_name_contents(context, exclude):
    """ Check contents of place/placex tables. Each row represents a table row
        and all data must match. Data not present in the expected table, may
        be arbitrary. The rows are identified via the 'object' column which must
        have an identifier of the form '<NRW><osm id>[:<class>]'. All
        expected rows are expected to be present with at least one database row.

        When 'exclude' is set, token columns are checked for the *absence*
        of the listed terms instead of their presence.
    """
    # Token lookups must go through the tokenizer configured for the test DB.
    tokenizer = tokenizer_factory.get_tokenizer_for_db(context.nominatim.get_test_config())

    with tokenizer.name_analyzer() as analyzer:
        with context.db.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur:
            for row in context.table:
                nid = NominatimID(row['object'])
                # Fetch the search_name row plus the centroid coordinates.
                nid.row_by_place_id(cur, 'search_name',
                                    ['ST_X(centroid) as cx', 'ST_Y(centroid) as cy'])
                assert cur.rowcount > 0, "No rows found for " + row['object']

                for res in cur:
                    db_row = DBRow(nid, res, context)
                    for name, value in zip(row.headings, row.cells):
                        if name in ('name_vector', 'nameaddress_vector'):
                            # Expected value is a comma-separated term list;
                            # resolve each term to its word token id(s).
                            items = [x.strip() for x in value.split(',')]
                            tokens = analyzer.get_word_token_info(context.db, items)

                            if not exclude:
                                assert len(tokens) >= len(items), \
                                       "No word entry found for {}. Entries found: {!s}".format(value, len(tokens))
                            for word, token, wid in tokens:
                                if exclude:
                                    assert wid not in res[name], \
                                           "Found term for {}/{}: {}".format(nid, name, wid)
                                else:
                                    assert wid in res[name], \
                                           "Missing term for {}/{}: {}".format(nid, name, wid)
                        elif name != 'object':
                            # Any other column is compared verbatim against
                            # the database row.
                            assert db_row.contains(name, value), db_row.assert_msg(name, value)