Example #1
    def run(self, args):
        from ..tools import refresh, postcodes
        from ..indexer.indexer import Indexer

        if args.postcodes:
            if postcodes.can_compute(args.config.get_libpq_dsn()):
                LOG.warning("Update postcodes centroid")
                tokenizer = self._get_tokenizer(args.config)
                postcodes.update_postcodes(args.config.get_libpq_dsn(),
                                           args.project_dir, tokenizer)
                indexer = Indexer(args.config.get_libpq_dsn(), tokenizer,
                                  args.threads or 1)
                indexer.index_postcodes()
            else:
                LOG.error(
                    "The place table doesn't exist. "
                    "Postcode updates on a frozen database is not possible.")

        if args.word_counts:
            LOG.warning('Recompute word statistics')
            self._get_tokenizer(args.config).update_statistics()

        if args.address_levels:
            LOG.warning('Updating address levels')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.load_address_levels_from_config(conn, args.config)

        if args.functions:
            LOG.warning('Create functions')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config, args.diffs,
                                         args.enable_debug_statements)
                self._get_tokenizer(args.config).update_sql_functions(
                    args.config)

        if args.wiki_data:
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH
                             or args.project_dir)
            LOG.warning('Import Wikipedia article importance from %s',
                        data_path)
            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                 data_path) > 0:
                LOG.fatal('FATAL: Wikipedia importance dump file not found')
                return 1

        # Attention: importance MUST come after wiki data import.
        if args.importance:
            LOG.warning('Update importance values for database')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.recompute_importance(conn)

        if args.website:
            webdir = args.project_dir / 'website'
            LOG.warning('Setting up website directory at %s', webdir)
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.setup_website(webdir, args.config, conn)

        return 0
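
The `args` object above comes from the project's command-line layer, which is not part of this example. A minimal, purely illustrative argparse sketch that would provide the boolean flags this `run` method checks (flag names and defaults are assumptions, not the real Nominatim CLI):

import argparse

def make_refresh_parser():
    # Hypothetical parser; the option names mirror the attributes read above.
    parser = argparse.ArgumentParser(prog='nominatim refresh')
    for flag in ('postcodes', 'word-counts', 'address-levels', 'functions',
                 'wiki-data', 'importance', 'website'):
        parser.add_argument('--' + flag, action='store_true')
    parser.add_argument('--diffs', action='store_true')
    parser.add_argument('--enable-debug-statements', action='store_true')
    parser.add_argument('--threads', type=int)
    return parser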
Example #2
    def _update(args):
        from ..tools import replication
        from ..indexer.indexer import Indexer

        params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
        params.update(base_url=args.config.REPLICATION_URL,
                      update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'),
                      import_file=args.project_dir / 'osmosischange.osc',
                      max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
                      indexed_only=not args.once)

        # Sanity check to not overwhelm the Geofabrik servers.
        if 'download.geofabrik.de' in params['base_url']\
           and params['update_interval'] < 86400:
            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
                      "Please check install documentation "
                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
                      "setting-up-the-update-process).")
            raise UsageError("Invalid replication update interval setting.")

        if not args.once:
            if not args.do_index:
                LOG.fatal("Indexing cannot be disabled when running updates continuously.")
                raise UsageError("Bad argument '--no-index'.")
            recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

        while True:
            with connect(args.config.get_libpq_dsn()) as conn:
                start = dt.datetime.now(dt.timezone.utc)
                state = replication.update(conn, params)
                if state is not replication.UpdateState.NO_CHANGES:
                    status.log_status(conn, start, 'import')
                batchdate, _, _ = status.get_status(conn)

            if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                index_start = dt.datetime.now(dt.timezone.utc)
                indexer = Indexer(args.config.get_libpq_dsn(),
                                  args.threads or 1)
                indexer.index_boundaries(0, 30)
                indexer.index_by_rank(0, 30)

                with connect(args.config.get_libpq_dsn()) as conn:
                    status.set_indexed(conn, True)
                    status.log_status(conn, index_start, 'index')
            else:
                index_start = None

            if LOG.isEnabledFor(logging.WARNING):
                UpdateReplication._report_update(batchdate, start, index_start)

            if args.once:
                break

            if state is replication.UpdateState.NO_CHANGES:
                LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
                time.sleep(recheck_interval)
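
`UpdateReplication._report_update` is referenced but not shown. A hypothetical sketch of such a reporting helper, working only with the three timestamps passed in (illustrative, not the actual Nominatim implementation):

import datetime as dt
import logging

LOG = logging.getLogger(__name__)

def _round_to_seconds(delta):
    # Drop sub-second precision for more readable log output.
    return dt.timedelta(seconds=int(delta.total_seconds()))

def _report_update(batchdate, start_import, start_index):
    end = dt.datetime.now(dt.timezone.utc)
    LOG.warning("Update completed. Import: %s. %sRemaining backlog: %s.",
                _round_to_seconds((start_index or end) - start_import),
                "Indexing: {}. ".format(_round_to_seconds(end - start_index))
                if start_index else '',
                _round_to_seconds(end - batchdate))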
Example #3
    def run(args):
        from ..tools import refresh
        from ..tokenizer import factory as tokenizer_factory

        if args.postcodes:
            LOG.warning("Update postcodes centroid")
            refresh.update_postcodes(args.config.get_libpq_dsn(),
                                     args.sqllib_dir)

        if args.word_counts:
            LOG.warning('Recompute frequency of full-word search terms')
            refresh.recompute_word_counts(args.config.get_libpq_dsn(),
                                          args.sqllib_dir)

        if args.address_levels:
            cfg = Path(args.config.ADDRESS_LEVEL_CONFIG)
            LOG.warning('Updating address levels from %s', cfg)
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.load_address_levels_from_file(conn, cfg)

        if args.functions:
            LOG.warning('Create functions')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.create_functions(conn, args.config, args.diffs,
                                         args.enable_debug_statements)
                tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
                tokenizer.update_sql_functions(args.config)

        if args.wiki_data:
            data_path = Path(args.config.WIKIPEDIA_DATA_PATH
                             or args.project_dir)
            LOG.warning('Import Wikipedia article importance from %s',
                        data_path)
            if refresh.import_wikipedia_articles(args.config.get_libpq_dsn(),
                                                 data_path) > 0:
                LOG.fatal('FATAL: Wikipedia importance dump file not found')
                return 1

        # Attention: importance MUST come after wiki data import.
        if args.importance:
            LOG.warning('Update importance values for database')
            with connect(args.config.get_libpq_dsn()) as conn:
                refresh.recompute_importance(conn)

        if args.website:
            webdir = args.project_dir / 'website'
            LOG.warning('Setting up website directory at %s', webdir)
            refresh.setup_website(webdir, args.config)

        return 0
Example #4
def create_db(dsn, rouser=None):
    """ Create a new database for the given DSN. Fails when the database
        already exists or the PostgreSQL version is too old.
        Uses `createdb` to create the database.

        If 'rouser' is given, then the function also checks that the user
        with that given name exists.

        Requires superuser rights by the caller.
    """
    proc = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)

    if proc.returncode != 0:
        raise UsageError('Creating new database failed.')

    with connect(dsn) as conn:
        postgres_version = conn.server_version_tuple()
        if postgres_version < POSTGRESQL_REQUIRED_VERSION:
            LOG.fatal('Minimum supported version of PostgreSQL is %d.%d. '
                      'Found version %d.%d.',
                      POSTGRESQL_REQUIRED_VERSION[0], POSTGRESQL_REQUIRED_VERSION[1],
                      postgres_version[0], postgres_version[1])
            raise UsageError('PostgreSQL server is too old.')

        if rouser is not None:
            with conn.cursor() as cur:
                cnt = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
                                 (rouser, ))
                if cnt == 0:
                    LOG.fatal("Web user '%s' does not exists. Create it with:\n"
                              "\n      createuser %s", rouser, rouser)
                    raise UsageError('Missing read-only user.')
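
`get_pg_env` is not shown here; it maps the libpq DSN onto environment variables so that the `createdb` child process connects to the intended server and database. A rough sketch of the idea, assuming a simple space-separated `key=value` DSN (the real helper is more thorough):

import os

# libpq connection keywords and their environment-variable counterparts.
_PG_ENV_MAP = {'host': 'PGHOST', 'port': 'PGPORT', 'dbname': 'PGDATABASE',
               'user': 'PGUSER', 'password': 'PGPASSWORD'}

def get_pg_env_sketch(dsn):
    env = dict(os.environ)
    for param in dsn.split():
        key, _, value = param.partition('=')
        if key in _PG_ENV_MAP and value:
            env[_PG_ENV_MAP[key]] = value
    return env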
Example #5
    def check_database(self, _):
        """ Check that the tokenizer is set up correctly.
        """
        hint = """\
             The PostgreSQL extension nominatim.so was not correctly loaded.

             Error: {error}

             Hints:
             * Check the output of the CMake/make installation step
             * Does nominatim.so exist?
             * Does nominatim.so exist on the database server?
             * Can nominatim.so be accessed by the database user?
             """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                try:
                    out = cur.scalar("SELECT make_standard_name('a')")
                except psycopg2.Error as err:
                    return hint.format(error=str(err))

        if out != 'a':
            return hint.format(
                error='Unexpected result for make_standard_name()')

        return None
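
`cursor.scalar()` is not part of the standard psycopg2 cursor API; in these examples it is a convenience method of a project-specific cursor class. The idea, sketched for a plain psycopg2 cursor (illustrative only):

def scalar(cur, sql, args=None):
    # Execute a query that yields exactly one value and return that value.
    cur.execute(sql, args)
    row = cur.fetchone()
    if row is None:
        raise RuntimeError('Query did not return a value.')
    return row[0]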
Example #6
def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    params = []
    for ccode, props in _COUNTRY_INFO.items():
        if ccode is not None and props is not None:
            if ignore_partitions:
                partition = 0
            else:
                partition = props.get('partition')
            lang = props['languages'][0] if len(props['languages']) == 1 else None
            params.append((ccode, partition, lang))

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute_values(
                """ UPDATE country_name
                    SET partition = part, country_default_language_code = lang
                    FROM (VALUES %s) AS v (cc, part, lang)
                    WHERE country_code = v.cc""", params)
        conn.commit()
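
`cursor.execute_values()` mirrors `psycopg2.extras.execute_values`, which expands a single `%s` placeholder into a multi-row `VALUES` list. The same update written against a plain psycopg2 connection might look like this (a sketch; `params` is the list of `(ccode, partition, lang)` tuples built above):

from psycopg2.extras import execute_values

def update_country_info(conn, params):
    with conn.cursor() as cur:
        execute_values(cur,
                       """ UPDATE country_name
                           SET partition = part, country_default_language_code = lang
                           FROM (VALUES %s) AS v (cc, part, lang)
                           WHERE country_code = v.cc""",
                       params)
    conn.commit()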
Example #7
    def index_full(self, analyse=True):
        """ Index the complete database. This will first index boudnaries
            followed by all other objects. When `analyse` is True, then the
            database will be analysed at the appropriate places to
            ensure that database statistics are updated.
        """
        with connect(self.dsn) as conn:
            conn.autocommit = True

            if analyse:
                def _analyze():
                    with conn.cursor() as cur:
                        cur.execute('ANALYZE')
            else:
                def _analyze():
                    pass

            self.index_by_rank(0, 4)
            _analyze()

            self.index_boundaries(0, 30)
            _analyze()

            self.index_by_rank(5, 25)
            _analyze()

            self.index_by_rank(26, 30)
            _analyze()

            self.index_postcodes()
            _analyze()
Example #8
def can_compute(dsn):
    """
        Check that the place table exists so that
        postcodes can be computed.
    """
    with connect(dsn) as conn:
        return conn.table_exists('place')
Example #9
    def __init__(self, dsn, sanitizer, token_analysis):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.sanitizer = sanitizer
        self.token_analysis = token_analysis

        self._cache = _TokenCache()
Example #10
    def init_from_project(self, config):
        """ Initialise the tokenizer from the project directory.
        """
        self.loader = ICURuleLoader(config)

        with connect(self.dsn) as conn:
            self.loader.load_config_from_db(conn)
Example #11
 def finalize_import(self, config):
     """ Do any required postprocessing to make the tokenizer data ready
         for use.
     """
     with connect(self.dsn) as conn:
         sqlp = SQLPreprocessor(conn, config)
         sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_indices.sql')
Example #12
 def run(args):
     if args.import_from_wiki:
         LOG.warning('Starting import of special phrases')
         with connect(args.config.get_libpq_dsn()) as db_connection:
             SpecialPhrasesImporter(args.config, args.phplib_dir,
                                    db_connection).import_from_wiki()
     return 0
Example #13
    def __init__(self, dsn, normalizer):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        psycopg2.extras.register_hstore(self.conn)

        self._cache = _TokenCache(self.conn)
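
`register_hstore` tells psycopg2 to convert PostgreSQL `hstore` columns (such as `place.address` in Nominatim) to Python dicts and back. A standalone illustration with placeholder connection details:

import psycopg2
import psycopg2.extras

conn = psycopg2.connect('dbname=nominatim')  # placeholder DSN
psycopg2.extras.register_hstore(conn)

with conn.cursor() as cur:
    cur.execute("SELECT address FROM place LIMIT 1")
    row = cur.fetchone()
    if row is not None:
        print(row[0])  # the hstore value arrives as a Python dict

conn.close()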
Example #14
def import_osm_data(osm_files, options, drop=False, ignore_errors=False):
    """ Import the given OSM files. 'options' contains the list of
        default settings for osm2pgsql.
    """
    options['import_file'] = osm_files
    options['append'] = False
    options['threads'] = 1

    if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
        # Make some educated guesses about cache size based on the size
        # of the import file and the available memory.
        mem = psutil.virtual_memory()
        fsize = 0
        if isinstance(osm_files, list):
            for fname in osm_files:
                fsize += os.stat(str(fname)).st_size
        else:
            fsize = os.stat(str(osm_files)).st_size
        options['osm2pgsql_cache'] = int(min((mem.available + mem.cached) * 0.75,
                                             fsize * 2) / 1024 / 1024) + 1

    run_osm2pgsql(options)

    with connect(options['dsn']) as conn:
        if not ignore_errors:
            with conn.cursor() as cur:
                cur.execute('SELECT * FROM place LIMIT 1')
                if cur.rowcount == 0:
                    raise UsageError('No data imported by osm2pgsql.')

        if drop:
            conn.drop_table('planet_osm_nodes')

    if drop and options['flatnode_file']:
        Path(options['flatnode_file']).unlink()
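
The cache heuristic above allows osm2pgsql at most 75% of the currently reclaimable memory, capped at twice the size of the input file. The same arithmetic isolated for clarity (values in MB; note that `psutil.virtual_memory().cached` is only reported on Linux-like systems):

import os
import psutil

def guess_osm2pgsql_cache(osm_file):
    mem = psutil.virtual_memory()
    fsize = os.stat(str(osm_file)).st_size
    # 75% of available+cached memory, but never more than twice the file size.
    return int(min((mem.available + mem.cached) * 0.75, fsize * 2) / 1024 / 1024) + 1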
Example #15
    def update_status_table(self):
        """ Update the status in the status table to 'indexed'.
        """
        with connect(self.dsn) as conn:
            with conn.cursor() as cur:
                cur.execute('UPDATE import_status SET indexed = true')

            conn.commit()
Example #16
 def _init_db_tables(self, config):
     """ Set up the word table and fill it with pre-computed word
         frequencies.
     """
     with connect(self.dsn) as conn:
         sqlp = SQLPreprocessor(conn, config)
         sqlp.run_sql_file(conn, 'tokenizer/icu_tokenizer_tables.sql')
         conn.commit()
Example #17
    def run(args):
        from ..tools import freeze

        with connect(args.config.get_libpq_dsn()) as conn:
            freeze.drop_update_tables(conn)
        freeze.drop_flatnode_file(str(args.config.get_path('FLATNODE_FILE')))

        return 0
Example #18
    def __init__(self, dsn, normalizer, transliterator, abbreviations):
        self.conn = connect(dsn).connection
        self.conn.autocommit = True
        self.normalizer = normalizer
        self.transliterator = transliterator
        self.abbreviations = abbreviations

        self._cache = _TokenCache()
Example #19
 def init_from_project(self):
     """ Initialise the tokenizer from the project directory.
     """
     with connect(self.dsn) as conn:
         self.normalization = get_property(conn, DBCFG_NORMALIZATION)
         self.transliteration = get_property(conn, DBCFG_TRANSLITERATION)
         self.abbreviations = json.loads(
             get_property(conn, DBCFG_ABBREVIATIONS))
Example #20
 def update_sql_functions(self, config):
     """ Reimport the SQL functions for this tokenizer.
     """
     with connect(self.dsn) as conn:
         max_word_freq = get_property(conn, DBCFG_MAXWORDFREQ)
         sqlp = SQLPreprocessor(conn, config)
         sqlp.run_sql_file(conn,
                           'tokenizer/legacy_icu_tokenizer.sql',
                           max_word_freq=max_word_freq)
Example #21
 def has_pending(self):
     """ Check if any data still needs indexing.
         This function must only be used after the import has finished.
         Otherwise it will be very expensive.
     """
     with connect(self.dsn) as conn:
         with conn.cursor() as cur:
             cur.execute(
                 "SELECT 'a' FROM placex WHERE indexed_status > 0 LIMIT 1")
             return cur.rowcount > 0
Example #22
 def _save_config(self, config):
     """ Save the configuration that needs to remain stable for the given
         database as database properties.
     """
     with connect(self.dsn) as conn:
         set_property(conn, DBCFG_NORMALIZATION, self.normalization)
         set_property(conn, DBCFG_MAXWORDFREQ, config.MAX_WORD_FREQUENCY)
         set_property(conn, DBCFG_TRANSLITERATION, self.transliteration)
         set_property(conn, DBCFG_ABBREVIATIONS,
                      json.dumps(self.abbreviations))
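
`get_property`/`set_property` persist tokenizer settings as rows in a small key/value table inside the database. A sketch of such a pair, assuming a table `nominatim_properties(property, value)` (the table and column names are assumptions for illustration):

def set_property(conn, name, value):
    with conn.cursor() as cur:
        cur.execute('DELETE FROM nominatim_properties WHERE property = %s', (name, ))
        cur.execute('INSERT INTO nominatim_properties (property, value) VALUES (%s, %s)',
                    (name, value))
    conn.commit()

def get_property(conn, name):
    with conn.cursor() as cur:
        cur.execute('SELECT value FROM nominatim_properties WHERE property = %s', (name, ))
        row = cur.fetchone()
    return row[0] if row is not None else None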
Example #23
def add_tiger_data(data_dir, config, threads):
    """ Import tiger data from directory or tar file `data dir`.
    """
    dsn = config.get_libpq_dsn()
    sql_files, tar = handle_tarfile_or_directory(data_dir)

    if not sql_files:
        return

    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_start.sql')

    # Read the SQL files and distribute their statements over a pool of
    # <threads - 1> database connections.
    sel = selectors.DefaultSelector()
    place_threads = max(1, threads - 1)

    # Creates a pool of database connections
    for _ in range(place_threads):
        conn = DBConnection(dsn)
        conn.connect()
        sel.register(conn, selectors.EVENT_WRITE, conn)

    for sql_file in sql_files:
        if not tar:
            file = open(sql_file)
        else:
            file = tar.extractfile(sql_file)

        handle_threaded_sql_statements(sel, file)

    # Unregistering pool of database connections
    handle_unregister_connection_pool(sel, place_threads)

    if tar:
        tar.close()
    print('\n')
    LOG.warning("Creating indexes on Tiger data")
    with connect(dsn) as conn:
        sql = SQLPreprocessor(conn, config)
        sql.run_sql_file(conn, 'tiger_import_finish.sql')
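
The pool above relies on the project's asynchronous `DBConnection` objects together with `selectors`, so several SQL statements are in flight at the same time. As a much simplified, synchronous illustration of spreading statements over several connections (this does not reproduce the parallelism of the original):

import itertools
import psycopg2

def run_statements_round_robin(dsn, statements, num_conns=2):
    # Distribute the statements over a few connections in round-robin order.
    conns = [psycopg2.connect(dsn) for _ in range(num_conns)]
    try:
        for conn, stmt in zip(itertools.cycle(conns), statements):
            with conn.cursor() as cur:
                cur.execute(stmt)
        for conn in conns:
            conn.commit()
    finally:
        for conn in conns:
            conn.close()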
Example #24
    def run(args):
        from ..tokenizer import factory as tokenizer_factory

        if args.import_from_wiki:
            LOG.warning('Starting import of special phrases')
            tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
            with connect(args.config.get_libpq_dsn()) as db_connection:
                SpecialPhrasesImporter(
                    args.config, args.phplib_dir,
                    db_connection).import_from_wiki(tokenizer)
        return 0
Example #25
def update_postcodes(dsn, project_dir, tokenizer):
    """ Update the table of artificial postcodes.

        Computes artificial postcode centroids from the placex table,
        potentially enhances it with external data and then updates the
        postcodes in the table 'location_postcode'.
    """
    with tokenizer.name_analyzer() as analyzer:
        with connect(dsn) as conn:
            # First get the list of countries that currently have postcodes.
            # (Doing this before starting to insert, so it is fast on import.)
            with conn.cursor() as cur:
                cur.execute(
                    "SELECT DISTINCT country_code FROM location_postcode")
                todo_countries = set((row[0] for row in cur))

            # Recompute the list of valid postcodes from placex.
            with conn.cursor(name="placex_postcodes") as cur:
                cur.execute("""
                SELECT cc as country_code, pc, ST_X(centroid), ST_Y(centroid)
                FROM (SELECT
                        COALESCE(plx.country_code,
                                 get_country_code(ST_Centroid(pl.geometry))) as cc,
                        token_normalized_postcode(pl.address->'postcode') as pc,
                        ST_Centroid(ST_Collect(COALESCE(plx.centroid,
                                                        ST_Centroid(pl.geometry)))) as centroid
                      FROM place AS pl LEFT OUTER JOIN placex AS plx
                             ON pl.osm_id = plx.osm_id AND pl.osm_type = plx.osm_type
                    WHERE pl.address ? 'postcode' AND pl.geometry IS NOT null
                    GROUP BY cc, pc) xx
                WHERE pc IS NOT null AND cc IS NOT null
                ORDER BY country_code, pc""")

                collector = None

                for country, postcode, x, y in cur:
                    if collector is None or country != collector.country:
                        if collector is not None:
                            collector.commit(conn, analyzer, project_dir)
                        collector = _CountryPostcodesCollector(country)
                        todo_countries.discard(country)
                    collector.add(postcode, x, y)

                if collector is not None:
                    collector.commit(conn, analyzer, project_dir)

            # Now handle any countries that are only in the postcode table.
            for country in todo_countries:
                _CountryPostcodesCollector(country).commit(
                    conn, analyzer, project_dir)

            conn.commit()

        analyzer.update_postcodes_from_db()
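
The `conn.cursor(name="placex_postcodes")` call opens a server-side (named) cursor, so the potentially very large postcode result set is streamed in batches instead of being loaded into memory at once. A standalone illustration with plain psycopg2 and a placeholder query:

import psycopg2

conn = psycopg2.connect('dbname=nominatim')  # placeholder DSN

with conn.cursor(name='postcode_scan') as cur:
    cur.itersize = 1000  # rows fetched per round trip to the server
    cur.execute('SELECT country_code, postcode FROM location_postcode')
    for country, postcode in cur:
        pass  # process each row without materialising the full result set

conn.close()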
Example #26
    def _init_replication(args):
        from ..tools import replication, refresh

        LOG.warning("Initialising replication updates")
        with connect(args.config.get_libpq_dsn()) as conn:
            replication.init_replication(conn, base_url=args.config.REPLICATION_URL)
            if args.update_functions:
                LOG.warning("Create functions")
                refresh.create_functions(conn, args.config, args.sqllib_dir,
                                         True, False)
        return 0
Example #27
    def _init_db_tables(self, config):
        """ Set up the word table and fill it with pre-computed word
            frequencies.
        """
        with connect(self.dsn) as conn:
            sqlp = SQLPreprocessor(conn, config)
            sqlp.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
            conn.commit()

        LOG.warning("Precomputing word tokens")
        db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
Example #28
    def start_import(args, loader):
        """
            Create the SPImporter object containing the right
            sp loader and then start the import of special phrases.
        """
        from ..tokenizer import factory as tokenizer_factory

        tokenizer = tokenizer_factory.get_tokenizer_for_db(args.config)
        should_replace = not args.no_replace
        with connect(args.config.get_libpq_dsn()) as db_connection:
            SPImporter(args.config, db_connection,
                       loader).import_phrases(tokenizer, should_replace)
Example #29
def import_base_data(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    if ignore_partitions:
        with connect(dsn) as conn:
            with conn.cursor() as cur:
                cur.execute('UPDATE country_name SET partition = 0')
            conn.commit()
Example #30
 def update_sql_functions(self, config):
     """ Reimport the SQL functions for this tokenizer.
     """
     with connect(self.dsn) as conn:
         max_word_freq = properties.get_property(conn, DBCFG_MAXWORDFREQ)
         modulepath = config.DATABASE_MODULE_PATH or \
                      str((config.project_dir / 'module').resolve())
         sqlp = SQLPreprocessor(conn, config)
         sqlp.run_sql_file(conn,
                           'tokenizer/legacy_tokenizer.sql',
                           max_word_freq=max_word_freq,
                           modulepath=modulepath)