Exemple #1
0
    def find_config_file(self, filename, config=None):
        """ Locate a configuration file and return its path.

            `filename` is the default file name. It is looked up in the
            project directory first and in the configuration directory
            second. If `config` names a configuration option with a
            non-empty value, that value replaces `filename`: an absolute
            value is resolved and returned directly when it points to a
            regular file, a relative value goes through the same directory
            search.

            Raises a UsageError when no regular file can be found.
        """
        configured = self.__getattr__(config) if config is not None else None

        if configured:
            candidate = Path(configured)

            if candidate.is_absolute():
                candidate = candidate.resolve()
                if candidate.is_file():
                    return candidate

                LOG.fatal("Cannot find config file '%s'.",
                          candidate)
                raise UsageError("Config file not found.")

            # A relative name from the option is searched like the default.
            filename = candidate

        search_paths = [self.project_dir, self.config_dir]
        for directory in search_paths:
            if directory is None:
                continue
            location = directory / filename
            if location.is_file():
                return location

        LOG.fatal(
            "Configuration file '%s' not found.\nDirectories searched: %s",
            filename, search_paths)
        raise UsageError("Config file not found.")
def create_db(dsn, rouser=None):
    """ Create the database described by `dsn` with the `createdb` tool.

        After creation the server version is compared against
        POSTGRESQL_REQUIRED_VERSION. When `rouser` is given, the function
        additionally verifies that a database user of that name exists.

        Raises a UsageError when `createdb` exits with a non-zero status,
        when the server version is too old or when `rouser` does not exist.

        Requires superuser rights by the caller.
    """
    result = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)
    if result.returncode != 0:
        raise UsageError('Creating new database failed.')

    with connect(dsn) as conn:
        found = conn.server_version_tuple()
        if found < POSTGRESQL_REQUIRED_VERSION:
            LOG.fatal('Minimum supported version of Postgresql is %d.%d. '
                      'Found version %d.%d.',
                      POSTGRESQL_REQUIRED_VERSION[0], POSTGRESQL_REQUIRED_VERSION[1],
                      found[0], found[1])
            raise UsageError('PostgreSQL server is too old.')

        if rouser is None:
            return

        with conn.cursor() as cur:
            user_count = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
                                    (rouser, ))
            if user_count == 0:
                LOG.fatal("Web user '%s' does not exists. Create it with:\n"
                          "\n      createuser %s", rouser, rouser)
                raise UsageError('Missing read-only user.')
Exemple #3
0
def compute_database_date(conn):
    """ Derive the date of the database from its newest node.

        The highest node id in the `place` table is looked up and version 1
        of that node is requested from the OSM API. The timestamp found in
        the response is returned as a timezone-aware datetime in UTC.

        Raises a UsageError when no node id is found or when the API
        response does not contain a timestamp.
    """
    # The node with the highest id serves as the reference object.
    with conn.cursor() as cur:
        newest_node = cur.scalar("SELECT max(osm_id) FROM place WHERE osm_type='N'")

        if newest_node is None:
            LOG.fatal("No data found in the database.")
            raise UsageError("No data found in the database.")

    LOG.info("Using node id %d for timestamp lookup", newest_node)
    # Version 1 of the node carries the time at which it was created.
    api_url = f'https://www.openstreetmap.org/api/0.6/node/{newest_node}/1'
    response = get_url(api_url)

    found = re.search(
        r'timestamp="((\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2}))Z"',
        response)

    if found is None:
        LOG.fatal(
            "The node data downloaded from the API does not contain valid data.\n"
            "URL used: %s", api_url)
        raise UsageError("Bad API data.")

    timestamp = found.group(1)
    LOG.debug("Found timestamp %s", timestamp)

    return dt.datetime.fromisoformat(timestamp).replace(tzinfo=dt.timezone.utc)
    def _convert_php_settings_if_needed(self, file_path):
        """
            Convert php settings file of special phrases to json file if it is still in php format.
        """
        if not isfile(file_path):
            raise UsageError(str(file_path) + ' is not a valid file.')

        file, extension = os.path.splitext(file_path)
        json_file_path = Path(file + '.json').resolve()

        if extension not in('.php', '.json'):
            raise UsageError('The custom NOMINATIM_PHRASE_CONFIG file has not a valid extension.')

        if extension == '.php' and not isfile(json_file_path):
            try:
                subprocess.run(['/usr/bin/env', 'php', '-Cq',
                                (self.phplib_dir / 'migration/PhraseSettingsToJson.php').resolve(),
                                file_path], check=True)
                LOG.warning('special_phrase configuration file has been converted to json.')
                return json_file_path
            except subprocess.CalledProcessError:
                LOG.error('Error while converting %s to json.', file_path)
                raise
        else:
            return json_file_path
Exemple #5
0
    def _update(args):
        """ Run the replication update loop.

            Each round applies the next batch of updates via
            `replication.update()` and, when changes arrived and indexing is
            enabled, indexes the result. With `args.once` a single round is
            executed, otherwise the loop repeats without end and sleeps
            whenever a round reports no changes.

            Raises a UsageError when the update interval is below 86400 for
            a replication URL containing 'download.geofabrik.de' or when
            indexing is disabled for continuous updates.
        """
        # Imported inside the function rather than at module level.
        from ..tools import replication
        from ..indexer.indexer import Indexer

        # Start from the osm2pgsql options and add the replication settings.
        params = args.osm2pgsql_options(default_cache=2000, default_threads=1)
        params.update(base_url=args.config.REPLICATION_URL,
                      update_interval=args.config.get_int('REPLICATION_UPDATE_INTERVAL'),
                      import_file=args.project_dir / 'osmosischange.osc',
                      max_diff_size=args.config.get_int('REPLICATION_MAX_DIFF'),
                      indexed_only=not args.once)

        # Sanity check to not overwhelm the Geofabrik servers.
        # NOTE(review): 86400 presumably means one day in seconds - confirm.
        if 'download.geofabrik.de'in params['base_url']\
           and params['update_interval'] < 86400:
            LOG.fatal("Update interval too low for download.geofabrik.de.\n"
                      "Please check install documentation "
                      "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
                      "setting-up-the-update-process).")
            raise UsageError("Invalid replication update interval setting.")

        if not args.once:
            if not args.do_index:
                LOG.fatal("Indexing cannot be disabled when running updates continuously.")
                raise UsageError("Bad argument '--no-index'.")
            # Only set (and only used below) in continuous mode.
            recheck_interval = args.config.get_int('REPLICATION_RECHECK_INTERVAL')

        while True:
            with connect(args.config.get_libpq_dsn()) as conn:
                start = dt.datetime.now(dt.timezone.utc)
                state = replication.update(conn, params)
                # Log an import status entry unless nothing changed.
                if state is not replication.UpdateState.NO_CHANGES:
                    status.log_status(conn, start, 'import')
                batchdate, _, _ = status.get_status(conn)

            if state is not replication.UpdateState.NO_CHANGES and args.do_index:
                index_start = dt.datetime.now(dt.timezone.utc)
                indexer = Indexer(args.config.get_libpq_dsn(),
                                  args.threads or 1)
                indexer.index_boundaries(0, 30)
                indexer.index_by_rank(0, 30)

                # A fresh connection is opened to record the indexing result.
                with connect(args.config.get_libpq_dsn()) as conn:
                    status.set_indexed(conn, True)
                    status.log_status(conn, index_start, 'index')
            else:
                # Tells the report below that no indexing ran in this round.
                index_start = None

            # Skip building the report when it would not be logged anyway.
            if LOG.isEnabledFor(logging.WARNING):
                UpdateReplication._report_update(batchdate, start, index_start)

            if args.once:
                break

            if state is replication.UpdateState.NO_CHANGES:
                LOG.warning("No new changes. Sleeping for %d sec.", recheck_interval)
                time.sleep(recheck_interval)
Exemple #6
0
    def load_sub_configuration(self, filename, config=None):
        """ Load additional configuration from a file. `filename` is the name
            of the configuration file. The file is first searched in the
            project directory and then in the global settings directory.

            If `config` is set, then the name of the configuration file can
            be additionally given through a .env configuration option. When
            the option is set, then the file will be exclusively loaded as set:
            if the name is an absolute path, the file name is taken as is,
            if the name is relative, it is taken to be relative to the
            project directory.

            The format of the file is determined from the filename suffix.
            Supported are YAML files ('.yaml', '.yml') and JSON files
            ('.json'). JSON files are always read as UTF-8.

            YAML files support a special '!include' construct. When the
            directive is given, the value is taken to be a filename, the file
            is loaded using this function and added at the position in the
            configuration tree.

            Returns the parsed content of the file. Raises a UsageError when
            the file cannot be found or has an unsupported suffix.
        """
        configfile = self.find_config_file(filename, config)

        if configfile.suffix in ('.yaml', '.yml'):
            return self._load_from_yaml(configfile)

        if configfile.suffix == '.json':
            # Be explicit about the encoding so that parsing does not
            # depend on the locale of the running process.
            with configfile.open('r', encoding='utf-8') as cfg:
                return json.load(cfg)

        raise UsageError(f"Config file '{configfile}' has unknown format.")
Exemple #7
0
def handle_tarfile_or_directory(data_dir):
    """ Collect the CSV files with tiger data from `data_dir`.

        `data_dir` is a string that either names a '.tar.gz' archive or a
        directory. Returns a tuple `(csv_files, tar)`:

        * for an archive, `csv_files` is the list of archive members whose
          name ends in '.csv' and `tar` is the open tarfile object, which
          the caller has to close,
        * for a directory, `csv_files` is the list of paths of the '.csv'
          files in it and `tar` is None,
        * when no CSV file is found, `(None, None)` is returned.

        Raises a UsageError when the archive cannot be read.
    """

    tar = None
    if data_dir.endswith('.tar.gz'):
        try:
            tar = tarfile.open(data_dir)
        except tarfile.ReadError as err:
            LOG.fatal("Cannot open '%s'. Is this a tar file?", data_dir)
            raise UsageError("Cannot open Tiger data file.") from err

        csv_files = [i for i in tar.getmembers() if i.name.endswith('.csv')]
        LOG.warning("Found %d CSV files in tarfile with path %s",
                    len(csv_files), data_dir)
        if not csv_files:
            LOG.warning(
                "Tiger data import selected but no files in tarfile's path %s",
                data_dir)
            # The handle is not handed to the caller in this case, so it
            # has to be released here to avoid leaking the open file.
            tar.close()
            return None, None
    else:
        files = os.listdir(data_dir)
        csv_files = [
            os.path.join(data_dir, i) for i in files if i.endswith('.csv')
        ]
        LOG.warning("Found %d CSV files in path %s", len(csv_files), data_dir)
        if not csv_files:
            LOG.warning(
                "Tiger data import selected but no files found in path %s",
                data_dir)
            return None, None

    return csv_files, tar
def import_osm_data(osm_files, options, drop=False, ignore_errors=False):
    """ Import the given OSM files. 'options' contains the list of
        default settings for osm2pgsql.

        `osm_files` is a single file or a list of files. `options` is
        modified in place: 'import_file', 'append' and 'threads' are always
        set and 'osm2pgsql_cache' is computed when it is 0 and no flatnode
        file is configured.

        Unless `ignore_errors` is set, a UsageError is raised when the
        `place` table is empty after the import. With `drop` set, the table
        'planet_osm_nodes' and the flatnode file (if configured) are removed
        after the import.
    """
    options['import_file'] = osm_files
    options['append'] = False
    options['threads'] = 1

    if not options['flatnode_file'] and options['osm2pgsql_cache'] == 0:
        # Make some educated guesses about cache size based on the size
        # of the import file and the available memory.
        mem = psutil.virtual_memory()
        files = osm_files if isinstance(osm_files, list) else [osm_files]
        fsize = sum(os.stat(str(fname)).st_size for fname in files)
        # psutil does not report a 'cached' field on every platform,
        # count it as 0 where it is missing instead of failing.
        usable = mem.available + getattr(mem, 'cached', 0)
        options['osm2pgsql_cache'] = int(min(usable * 0.75,
                                             fsize * 2) / 1024 / 1024) + 1

    run_osm2pgsql(options)

    with connect(options['dsn']) as conn:
        if not ignore_errors:
            with conn.cursor() as cur:
                cur.execute('SELECT * FROM place LIMIT 1')
                if cur.rowcount == 0:
                    raise UsageError('No data imported by osm2pgsql.')

        if drop:
            conn.drop_table('planet_osm_nodes')

    if drop and options['flatnode_file']:
        Path(options['flatnode_file']).unlink()
Exemple #9
0
def create(func):
    """ Create a name processing function that splits name values with
        multiple values into their components.

        `func` is the configuration of the sanitizer step. Its optional
        entry 'delimiters' holds the characters at which names are split
        (default: ',;'). Every character is taken literally. Whitespace
        around a delimiter is dropped together with it.

        Returns a function that takes an object with a `names` list and
        replaces that list in place. Names without a delimiter are kept as
        they are, all others are replaced by clones for each non-empty part.

        Raises a UsageError when the set of delimiters is empty.
    """
    delimiter_set = set(func.get('delimiters', ',;'))
    if not delimiter_set:
        raise UsageError(
            "Set of delimiters in split-name-list sanitizer is empty.")

    # re.escape() quotes only what needs quoting. Blindly prefixing a
    # backslash would turn a delimiter like 'd' into the class '\d'.
    regexp = re.compile('\\s*[{}]\\s*'.format(''.join(re.escape(d)
                                                      for d in delimiter_set)))

    def _process(obj):
        if not obj.names:
            return

        new_names = []
        for name in obj.names:
            split_names = regexp.split(name.name)
            if len(split_names) == 1:
                new_names.append(name)
            else:
                new_names.extend(name.clone(name=n) for n in split_names if n)

        obj.names = new_names

    return _process
Exemple #10
0
    def check_csv_validity(self):
        """
            Make sure that the name of the csv file ends in '.csv'.
            Raises a UsageError otherwise.
        """
        extension = os.path.splitext(self.csv_path)[1]

        if extension == '.csv':
            return

        raise UsageError('The file {} is not a csv file.'.format(self.csv_path))
Exemple #11
0
 def get_int(self, name):
     """ Return the given configuration parameter as an int.

         Raises a UsageError when the value of the parameter cannot be
         parsed as an integer. The original ValueError is attached as
         the cause of the UsageError.
     """
     try:
         return int(self.__getattr__(name))
     except ValueError as exp:
         LOG.fatal("Invalid setting NOMINATIM_%s. Needs to be a number.",
                   name)
         raise UsageError("Configuration error.") from exp
def _require_version(module, actual, expected):
    """ Compares the version for the given module and raises an exception
        if the actual version is too old.
    """
    if actual < expected:
        LOG.fatal('Minimum supported version of %s is %d.%d. '
                  'Found version %d.%d.',
                  module, expected[0], expected[1], actual[0], actual[1])
        raise UsageError(f'{module} is too old.')
Exemple #13
0
def _get_section(rules, section):
    """ Get the section named 'section' from the rules. If the section does
        not exist, raise a usage error with a meaningful message.
    """
    if section not in rules:
        LOG.fatal("Section '%s' not found in tokenizer config.", section)
        raise UsageError("Syntax error in tokenizer configuration file.")

    return rules[section]
Exemple #14
0
def _pipe_to_proc(proc, fdesc):
    chunk = fdesc.read(2048)
    while chunk and proc.poll() is None:
        try:
            proc.stdin.write(chunk)
        except BrokenPipeError as exc:
            raise UsageError("Failed to execute SQL file.") from exc
        chunk = fdesc.read(2048)

    return len(chunk)
def setup_database_skeleton(dsn, rouser=None):
    """ Create a new database for Nominatim and install the extensions
        hstore and postgis in it.

        The database is created with the `createdb` tool. The function
        raises a UsageError when `createdb` fails, when the PostgreSQL or
        PostGIS version is older than required or when `rouser` is given
        but no database user of that name exists.

        Requires superuser rights by the caller.
    """
    created = subprocess.run(['createdb'], env=get_pg_env(dsn), check=False)
    if created.returncode != 0:
        raise UsageError('Creating new database failed.')

    with connect(dsn) as conn:
        _require_version('PostgreSQL server',
                         conn.server_version_tuple(),
                         POSTGRESQL_REQUIRED_VERSION)

        if rouser is not None:
            with conn.cursor() as cur:
                user_count = cur.scalar('SELECT count(*) FROM pg_user where usename = %s',
                                        (rouser, ))
                if user_count == 0:
                    LOG.fatal("Web user '%s' does not exists. Create it with:\n"
                              "\n      createuser %s", rouser, rouser)
                    raise UsageError('Missing read-only user.')

        # Install the extensions, then check the PostGIS version.
        with conn.cursor() as cur:
            for extension in ('hstore', 'postgis'):
                cur.execute('CREATE EXTENSION IF NOT EXISTS ' + extension)
        conn.commit()

        _require_version('PostGIS',
                         conn.postgis_version_tuple(),
                         POSTGIS_REQUIRED_VERSION)
Exemple #16
0
def analyse_indexing(conn, osm_id=None, place_id=None):
    """ Analyse indexing of a single Nominatim object.

        The object is selected either by `osm_id`, a string of the form
        <N|W|R><id>, or directly by `place_id`. When `osm_id` is given, it
        takes precedence and the place id is looked up in placex.

        The function sets indexed_status of the object to 2, enables
        auto_explain and then sets indexed_status to 0. All changes are
        rolled back afterwards and the notices collected on the connection
        are printed.

        Raises a UsageError when `osm_id` is malformed or unknown or when
        no object was given at all.
    """
    with conn.cursor() as cur:
        if osm_id:
            type_letter = osm_id[0].upper()
            number = osm_id[1:]
            if not (type_letter in 'NWR' and number.isdigit()):
                LOG.fatal('OSM ID must be of form <N|W|R><id>. Got: %s',
                          osm_id)
                raise UsageError("OSM ID parameter badly formatted")

            cur.execute(
                'SELECT place_id FROM placex WHERE osm_type = %s AND osm_id = %s',
                (type_letter, number))
            if cur.rowcount < 1:
                LOG.fatal("OSM object %s not found in database.", osm_id)
                raise UsageError("OSM object not found")

            place_id = cur.fetchone()[0]

        if place_id is None:
            LOG.fatal("No OSM object given to index.")
            raise UsageError("OSM object not found")

        target = (place_id, )

        cur.execute("update placex set indexed_status = 2 where place_id = %s",
                    target)

        cur.execute("""SET auto_explain.log_min_duration = '0';
                       SET auto_explain.log_analyze = 'true';
                       SET auto_explain.log_nested_statements = 'true';
                       LOAD 'auto_explain';
                       SET client_min_messages = LOG;
                       SET log_min_messages = FATAL""")

        cur.execute("update placex set indexed_status = 0 where place_id = %s",
                    target)

    # Only the notices are of interest, the changes are thrown away.
    conn.rollback()

    for notice in conn.notices:
        print(notice)
Exemple #17
0
    def _parse_variant_word(self, name):
        name = name.strip()
        match = re.fullmatch(r'([~^]?)([^~$^]*)([~$]?)', name)
        if match is None or (match.group(1) == '~' and match.group(3) == '~'):
            raise UsageError(
                "Invalid variant word descriptor '{}'".format(name))
        norm_name = self.norm.transliterate(match.group(2)).strip()
        if not norm_name:
            return None

        return norm_name, match.group(1), match.group(3)
    def _check_sanity(self, lang, phrase_class, phrase_type):
        """
            Check sanity of given inputs in case somebody added garbage in the wiki.
            If a bad class/type is detected the system will exit with an error.
        """
        type_matchs = self.sanity_check_pattern.findall(phrase_type)
        class_matchs = self.sanity_check_pattern.findall(phrase_class)

        if len(class_matchs) < 1 or len(type_matchs) < 1:
            raise UsageError("Bad class/type for language {}: {}={}".format(
                lang, phrase_class, phrase_type))
Exemple #19
0
    def run(args):
        """ Import special phrases from the wiki and/or from a csv file,
            depending on which of the two is requested in `args`.
            Raises a UsageError when the csv file does not exist.
            Returns 0.
        """
        if args.import_from_wiki:
            ImportSpecialPhrases.start_import(args, SPWikiLoader(args.config))

        if not args.import_from_csv:
            return 0

        if not Path(args.import_from_csv).is_file():
            LOG.fatal("CSV file '%s' does not exist.", args.import_from_csv)
            raise UsageError('Cannot access file.')

        ImportSpecialPhrases.start_import(args, SPCsvLoader(args.import_from_csv))

        return 0
    def __init__(self, rules):
        self.handlers = []

        if rules:
            for func in rules:
                if 'step' not in func:
                    raise UsageError(
                        "Sanitizer rule is missing the 'step' attribute.")
                module_name = 'nominatim.tokenizer.sanitizers.' + func[
                    'step'].replace('-', '_')
                handler_module = importlib.import_module(module_name)
                self.handlers.append(handler_module.create(func))
Exemple #21
0
def _run_api(endpoint, args, params):
    script_file = args.project_dir / 'website' / (endpoint + '.php')

    if not script_file.exists():
        LOG.error("Cannot find API script file.\n\n"
                  "Make sure to run 'nominatim' from the project directory \n"
                  "or use the option --project-dir.")
        raise UsageError("API script not found.")

    return run_api_script(endpoint,
                          args.project_dir,
                          phpcgi_bin=args.phpcgi_path,
                          params=params)
Exemple #22
0
def connect(dsn):
    """ Open a database connection for `dsn` with the specialised
        connection factory.

        The connection is returned wrapped in a `contextlib.closing`
        object, so that it may be used in a 'with' statement. Outside a
        context manager the connection itself is available through the
        `connection` attribute of the returned object.

        A psycopg2.OperationalError while connecting is converted into a
        UsageError.
    """
    try:
        conn = psycopg2.connect(dsn, connection_factory=_Connection)
    except psycopg2.OperationalError as err:
        raise UsageError("Cannot connect to database: {}".format(err)) from err

    wrapper = contextlib.closing(conn)
    wrapper.connection = conn

    return wrapper
Exemple #23
0
def _check_module(module_dir, conn):
    """ Try to use the PostgreSQL module to confirm that it is correctly
        installed and accessible from PostgreSQL.
    """
    with conn.cursor() as cur:
        try:
            cur.execute("""CREATE FUNCTION nominatim_test_import_func(text)
                           RETURNS text AS '{}/nominatim.so', 'transliteration'
                           LANGUAGE c IMMUTABLE STRICT;
                           DROP FUNCTION nominatim_test_import_func(text)
                        """.format(module_dir))
        except psycopg2.DatabaseError as err:
            LOG.fatal("Error accessing database module: %s", err)
            raise UsageError("Database module cannot be accessed.") from err
Exemple #24
0
def update(conn, options):
    """ Update database from the next batch of data. Returns the state of
        updates according to `UpdateState`.

        `options` holds the osm2pgsql settings plus the replication
        settings 'base_url', 'update_interval', 'import_file',
        'max_diff_size' and 'indexed_only'. The entries 'append' and
        'disable_jit' are set by this function.

        The function sleeps until `update_interval` seconds have passed
        since the date of the current database status, downloads the next
        diffs into the import file, applies them with osm2pgsql and saves
        the new replication state with the indexed flag unset.

        Returns MORE_PENDING without doing anything when 'indexed_only' is
        set and the database is not fully indexed, NO_CHANGES when there
        are no new diffs and UP_TO_DATE after a successful update.

        Raises a UsageError when no replication sequence is stored yet.
    """
    startdate, startseq, indexed = status.get_status(conn)

    if startseq is None:
        LOG.error("Replication not set up. "
                  "Please run 'nominatim replication --init' first.")
        raise UsageError("Replication not set up.")

    if not indexed and options['indexed_only']:
        LOG.info("Skipping update. There is data that needs indexing.")
        return UpdateState.MORE_PENDING

    last_since_update = dt.datetime.now(dt.timezone.utc) - startdate
    update_interval = dt.timedelta(seconds=options['update_interval'])
    if last_since_update < update_interval:
        # total_seconds() is needed here: the 'seconds' attribute leaves
        # out full days and would cut the wait short for long intervals.
        duration = int((update_interval - last_since_update).total_seconds())
        LOG.warning("Sleeping for %s sec before next update.", duration)
        time.sleep(duration)

    # Start with a clean import file.
    if options['import_file'].exists():
        options['import_file'].unlink()

    # Read updates into file.
    repl = ReplicationServer(options['base_url'])

    outhandler = WriteHandler(str(options['import_file']))
    endseq = repl.apply_diffs(outhandler,
                              startseq + 1,
                              max_size=options['max_diff_size'] * 1024)
    outhandler.close()

    if endseq is None:
        return UpdateState.NO_CHANGES

    # Consume updates with osm2pgsql.
    options['append'] = True
    options['disable_jit'] = conn.server_version_tuple() >= (11, 0)
    run_osm2pgsql(options)

    # Save the new replication state in the database. The timestamp is
    # left empty when the state info could not be retrieved.
    endstate = repl.get_state_info(endseq)
    status.set_status(conn,
                      endstate.timestamp if endstate else None,
                      seq=endseq,
                      indexed=False)

    return UpdateState.UP_TO_DATE
Exemple #25
0
    def _setup_analysis(self):
        """ Process the rules used for creating the various token analyzers.
        """
        self.analysis = {}

        if not isinstance(self.analysis_rules, list):
            raise UsageError(
                "Configuration section 'token-analysis' must be a list.")

        for section in self.analysis_rules:
            name = section.get('id', None)
            if name in self.analysis:
                if name is None:
                    LOG.fatal(
                        "ICU tokenizer configuration has two default token analyzers."
                    )
                else:
                    LOG.fatal(
                        "ICU tokenizer configuration has two token "
                        "analyzers with id '%s'.", name)
                raise UsageError("Syntax error in ICU tokenizer config.")
            self.analysis[name] = TokenAnalyzerRule(section,
                                                    self.normalization_rules)
Exemple #26
0
    def get_osm_file_list(self):
        """ Convert the --osm-file argument into a list of Paths.

            Returns None if no argument was given. Raises a UsageError for
            the first entry that is not an existing file.
        """
        if not self.osm_file:
            return None

        paths = [Path(name) for name in self.osm_file]

        missing = next((path for path in paths if not path.is_file()), None)
        if missing is not None:
            LOG.fatal("OSM file '%s' does not exist.", missing)
            raise UsageError('Cannot access file.')

        return paths
Exemple #27
0
    def _compute_update_interval(args):
        if args.catch_up:
            return 0

        update_interval = args.config.get_int('REPLICATION_UPDATE_INTERVAL')
        # Sanity check to not overwhelm the Geofabrik servers.
        if 'download.geofabrik.de' in args.config.REPLICATION_URL\
           and update_interval < 86400:
            LOG.fatal(
                "Update interval too low for download.geofabrik.de.\n"
                "Please check install documentation "
                "(https://nominatim.org/release-docs/latest/admin/Import-and-Update#"
                "setting-up-the-update-process).")
            raise UsageError("Invalid replication update interval setting.")

        return update_interval
def setup_extensions(conn):
    """ Install the extensions hstore and postgis in the database unless
        they exist already and commit the change. Afterwards the PostGIS
        version is compared against POSTGIS_REQUIRED_VERSION and a
        UsageError is raised when it is too old.
    """
    with conn.cursor() as cur:
        for extension in ('hstore', 'postgis'):
            cur.execute('CREATE EXTENSION IF NOT EXISTS ' + extension)
    conn.commit()

    found = conn.postgis_version_tuple()
    if not found < POSTGIS_REQUIRED_VERSION:
        return

    LOG.fatal('Minimum supported version of PostGIS is %d.%d. '
              'Found version %d.%d.',
              POSTGIS_REQUIRED_VERSION[0], POSTGIS_REQUIRED_VERSION[1],
              found[0], found[1])
    raise UsageError('PostGIS version is too old.')
Exemple #29
0
def _guess_version(conn):
    """ Guess a database version when there is no property table yet.
        Only migrations for 3.6 and later are supported, so bail out
        when the version seems older.
    """
    with conn.cursor() as cur:
        # In version 3.6, the country_name table was updated. Check for that.
        cnt = cur.scalar("""SELECT count(*) FROM
                            (SELECT svals(name) FROM  country_name
                             WHERE country_code = 'gb')x;
                         """)
        if cnt < 100:
            LOG.fatal('It looks like your database was imported with a version '
                      'prior to 3.6.0. Automatic migration not possible.')
            raise UsageError('Migration not possible.')

    return (3, 5, 0, 99)
Exemple #30
0
def flatten_config_list(content, section=''):
    """ Flatten a YAML configuration list whose elements may be lists
        themselves, for example because they come from include sections.

        Nested lists are expanded recursively, all other elements are kept
        in their original order. Empty content yields an empty list.
        `section` is only used in the message of the UsageError that is
        raised when non-empty content is not a list.
    """
    if not content:
        return []

    if not isinstance(content, list):
        raise UsageError(f"List expected in section '{section}'.")

    flat = []
    for item in content:
        flat += flatten_config_list(item, section) if isinstance(item, list) else [item]

    return flat