def setup_country_tables(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    # Collect (country_code, partition, default_language) rows for a bulk update.
    rows = []
    for code, info in _COUNTRY_INFO.items():
        if code is None or info is None:
            continue
        part = 0 if ignore_partitions else info.get('partition')
        langs = info['languages']
        # Only set a default language when the country has exactly one.
        default_lang = langs[0] if len(langs) == 1 else None
        rows.append((code, part, default_lang))

    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute_values(
                """ UPDATE country_name SET partition = part, country_default_language_code = lang FROM (VALUES %s) AS v (cc, part, lang) WHERE country_code = v.cc""",
                rows)
        conn.commit()
def import_wikipedia_articles(dsn, data_path, ignore_errors=False):
    """ Replaces the wikipedia importance tables with new data.
        The import is run in a single transaction so that the new data
        is replaced seamlessly.

        Returns 0 if all was well and 1 if the importance file could not
        be found. Throws an exception if there was an error reading the file.
    """
    datafile = data_path / 'wikimedia-importance.sql.gz'

    # Missing data file is an expected condition, reported via return code.
    if not datafile.exists():
        return 1

    # Drop the old tables and load the new ones inside one transaction so
    # readers never see the tables half-replaced.
    pre_code = """BEGIN;
                  DROP TABLE IF EXISTS "wikipedia_article";
                  DROP TABLE IF EXISTS "wikipedia_redirect"
               """
    post_code = "COMMIT"
    execute_file(dsn, datafile, ignore_errors=ignore_errors,
                 pre_code=pre_code, post_code=post_code)

    return 0
def test_execute_file_with_pre_code(dsn, tmp_path, temp_db_cursor):
    # The pre_code must run first so the INSERT in the file has a table to hit.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('INSERT INTO test VALUES(4)')

    db_utils.execute_file(dsn, sqlfile, pre_code='CREATE TABLE test (id INT)')

    assert temp_db_cursor.row_set('SELECT * FROM test') == {(4, )}
def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
    # A file with multiple statements must be executed in full.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')

    db_utils.execute_file(dsn, sqlfile)

    assert temp_db_cursor.row_set('SELECT * FROM test') == {(56, )}
def test_execute_file_success(dsn, temp_db_cursor, tmp_path):
    # NOTE(review): this shares its name with another test_execute_file_success
    # in this source; if both live in the same module the later definition
    # shadows the earlier one — confirm they belong to different files.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT);\nINSERT INTO test VALUES(56);')

    db_utils.execute_file(dsn, sqlfile)

    temp_db_cursor.execute('SELECT * FROM test')

    assert temp_db_cursor.rowcount == 1
    assert temp_db_cursor.fetchone()[0] == 56
def _init_db_tables(self, config):
    """ Set up the word table and fill it with pre-computed word
        frequencies.
    """
    with connect(self.dsn) as conn:
        preprocessor = SQLPreprocessor(conn, config)
        preprocessor.run_sql_file(conn, 'tokenizer/legacy_tokenizer_tables.sql')
        conn.commit()

    LOG.warning("Precomputing word tokens")

    # Load the pre-computed word frequency data from the library data dir.
    db_utils.execute_file(self.dsn, config.lib_dir.data / 'words.sql')
def test_execute_file_with_post_code(dsn, tmp_path, temp_db_cursor):
    # The post_code must run after the file, so its INSERT sees the new table.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE TABLE test (id INT)')

    db_utils.execute_file(dsn, sqlfile, post_code='INSERT INTO test VALUES(23)')

    temp_db_cursor.execute('SELECT * FROM test')

    assert temp_db_cursor.rowcount == 1
    assert temp_db_cursor.fetchone()[0] == 23
def import_base_data(dsn, sql_dir, ignore_partitions=False):
    """ Create and populate the tables with basic static data that provides
        the background for geocoding. Data is assumed to not yet exist.
    """
    db_utils.execute_file(dsn, sql_dir / 'country_name.sql')
    db_utils.execute_file(dsn, sql_dir / 'country_osm_grid.sql.gz')

    if not ignore_partitions:
        return

    # Partitioning disabled: force every country into partition 0.
    with connect(dsn) as conn:
        with conn.cursor() as cur:
            cur.execute('UPDATE country_name SET partition = 0')
        conn.commit()
def load_data(dsn, data_dir, threads): """ Copy data into the word and placex table. """ # Pre-calculate the most important terms in the word list. db_utils.execute_file(dsn, data_dir / 'words.sql') sel = selectors.DefaultSelector() # Then copy data from place to placex in <threads - 1> chunks. place_threads = max(1, threads - 1) for imod in range(place_threads): conn = DBConnection(dsn) conn.connect() conn.perform("""INSERT INTO placex ({0}) SELECT {0} FROM place WHERE osm_id % {1} = {2} AND NOT (class='place' and type='houses') AND ST_IsValid(geometry) """.format(_COPY_COLUMNS, place_threads, imod)) sel.register(conn, selectors.EVENT_READ, conn) # Address interpolations go into another table. conn = DBConnection(dsn) conn.connect() conn.perform("""INSERT INTO location_property_osmline (osm_id, address, linegeo) SELECT osm_id, address, geometry FROM place WHERE class='place' and type='houses' and osm_type='W' and ST_GeometryType(geometry) = 'ST_LineString' """) sel.register(conn, selectors.EVENT_READ, conn) # Now wait for all of them to finish. todo = place_threads + 1 while todo > 0: for key, _ in sel.select(1): conn = key.data sel.unregister(conn) conn.wait() conn.close() todo -= 1 print('.', end='', flush=True) print('\n') with connect(dsn) as conn: with conn.cursor() as cur: cur.execute('ANALYSE')
def test_execute_file_bad_sql_ignore_errors(dsn, tmp_path):
    # Broken SQL must not raise when ignore_errors is set.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE STABLE test (id INT)')

    db_utils.execute_file(dsn, sqlfile, ignore_errors=True)
def test_execute_file_bad_sql(dsn, tmp_path):
    # Broken SQL must surface as a UsageError by default.
    sqlfile = tmp_path / 'test.sql'
    sqlfile.write_text('CREATE STABLE test (id INT)')

    with pytest.raises(UsageError):
        db_utils.execute_file(dsn, sqlfile)
def test_execute_file_bad_file(dsn, tmp_path):
    # A non-existent file must raise rather than be silently skipped.
    with pytest.raises(FileNotFoundError):
        db_utils.execute_file(dsn, tmp_path / 'test2.sql')
def recompute_word_counts(dsn, sql_dir):
    """ Compute the frequency of full-word search terms.
    """
    sqlfile = sql_dir / 'words_from_search_name.sql'
    execute_file(dsn, sqlfile)
def update_postcodes(dsn, sql_dir):
    """ Recalculate postcode centroids and add, remove and update entries in
        the location_postcode table.

        `dsn` is the connection string for the database to update and
        `sql_dir` the directory containing `update-postcodes.sql`.
    """
    execute_file(dsn, sql_dir / 'update-postcodes.sql')
def import_postcodes(dsn, project_dir, tokenizer):
    """ Set up the initial list of postcodes.

        Drops and recreates the gb/us postcode helper tables, optionally
        loads external postcode data from `project_dir`, rebuilds
        location_postcode from placex plus the helper tables and finally
        lets the tokenizer re-register the postcode tokens.
    """

    with connect(dsn) as conn:
        conn.drop_table('gb_postcode')
        conn.drop_table('us_postcode')

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE gb_postcode (
                            id integer,
                            postcode character varying(9),
                            geometry GEOMETRY(Point, 4326))""")

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE us_postcode (
                            postcode text,
                            x double precision,
                            y double precision)""")
        conn.commit()

        # External postcode data is optional; load it only when present.
        gb_postcodes = project_dir / 'gb_postcode_data.sql.gz'
        if gb_postcodes.is_file():
            execute_file(dsn, gb_postcodes)

        us_postcodes = project_dir / 'us_postcode_data.sql.gz'
        if us_postcodes.is_file():
            execute_file(dsn, us_postcodes)

        with conn.cursor() as cur:
            cur.execute("TRUNCATE location_postcode")
            # Centroids of all placex objects that carry a valid,
            # normalizable postcode, one entry per country/postcode pair.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, country_code,
                       token_normalized_postcode(address->'postcode') as pc,
                       ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                  FROM placex
                 WHERE address ? 'postcode'
                       and token_normalized_postcode(address->'postcode') is not null
                       AND geometry IS NOT null
                 GROUP BY country_code, pc
            """)
            # Fill in US postcodes from the helper table that OSM data
            # did not already provide.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'us',
                       token_normalized_postcode(postcode),
                       ST_SetSRID(ST_Point(x,y),4326)
                  FROM us_postcode
                 WHERE token_normalized_postcode(postcode) NOT IN
                       (SELECT postcode FROM location_postcode
                         WHERE country_code = 'us')
            """)
            # Same for GB postcodes from the helper table.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'gb',
                       token_normalized_postcode(postcode), geometry
                  FROM gb_postcode
                 WHERE token_normalized_postcode(postcode) NOT IN
                       (SELECT postcode FROM location_postcode
                         WHERE country_code = 'gb')
            """)
            # Drop word tokens for postcodes that no longer exist.
            cur.execute("""
                DELETE FROM word
                 WHERE class='place' and type='postcode'
                   and word NOT IN (SELECT postcode FROM location_postcode)
            """)
        conn.commit()

    # Re-create the postcode tokens via the configured tokenizer.
    with tokenizer.name_analyzer() as analyzer:
        analyzer.add_postcodes_from_db()
def import_postcodes(dsn, project_dir):
    """ Set up the initial list of postcodes.

        Drops and recreates the gb/us postcode helper tables, optionally
        loads external postcode data from `project_dir` and rebuilds
        location_postcode from placex plus the helper tables.
    """

    with connect(dsn) as conn:
        conn.drop_table('gb_postcode')
        conn.drop_table('us_postcode')

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE gb_postcode (
                            id integer,
                            postcode character varying(9),
                            geometry GEOMETRY(Point, 4326))""")

        with conn.cursor() as cur:
            cur.execute("""CREATE TABLE us_postcode (
                            postcode text,
                            x double precision,
                            y double precision)""")
        conn.commit()

        # External postcode data is optional; load it only when present.
        gb_postcodes = project_dir / 'gb_postcode_data.sql.gz'
        if gb_postcodes.is_file():
            execute_file(dsn, gb_postcodes)

        us_postcodes = project_dir / 'us_postcode_data.sql.gz'
        if us_postcodes.is_file():
            execute_file(dsn, us_postcodes)

        with conn.cursor() as cur:
            cur.execute("TRUNCATE location_postcode")
            # Centroids of all placex objects with a usable postcode,
            # one entry per country/postcode pair. Postcodes containing
            # ',' or ';' are multi-value and are skipped.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, country_code,
                       upper(trim (both ' ' from address->'postcode')) as pc,
                       ST_Centroid(ST_Collect(ST_Centroid(geometry)))
                  FROM placex
                 WHERE address ? 'postcode'
                       AND address->'postcode' NOT SIMILAR TO '%(,|;)%'
                       AND geometry IS NOT null
                 GROUP BY country_code, pc
            """)
            # Fill in US postcodes from the helper table that OSM data
            # did not already provide.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'us', postcode,
                       ST_SetSRID(ST_Point(x,y),4326)
                  FROM us_postcode
                 WHERE postcode NOT IN
                       (SELECT postcode FROM location_postcode
                         WHERE country_code = 'us')
            """)
            # Same for GB postcodes from the helper table.
            cur.execute("""
                INSERT INTO location_postcode
                 (place_id, indexed_status, country_code, postcode, geometry)
                SELECT nextval('seq_place'), 1, 'gb', postcode, geometry
                  FROM gb_postcode
                 WHERE postcode NOT IN
                       (SELECT postcode FROM location_postcode
                         WHERE country_code = 'gb')
            """)
            # Drop word tokens for postcodes that no longer exist.
            cur.execute("""
                DELETE FROM word
                 WHERE class='place' and type='postcode'
                   and word NOT IN (SELECT postcode FROM location_postcode)
            """)
            # Ensure every distinct postcode has a word token.
            cur.execute("""
                SELECT count(getorcreate_postcode_id(v)) FROM
                  (SELECT distinct(postcode) as v FROM location_postcode) p
            """)
        conn.commit()