def _foreign_keys_for_columns(column_names):
    """Build the ``{column_name: (table, label)}`` foreign-key dictionary.

    Every column whose name matches one of the known patterns (for example
    ``Subject`` or ``Subject_2``) is mapped to ``(<table name>, "value")``.
    Columns that match no pattern are left out of the result.
    """
    column_map = [
        (re.compile(r"Other Title(?:|_\d+)"), "Title"),
        (re.compile(r"Creator/Publisher(?:|_\d+)"), "Creator"),
        (re.compile(r"Subject(?:|_\d+)"), "Subject"),
        (re.compile(r"Collection(?:|_\d+)"), "Collection"),
        (re.compile(r"Era(?:|_\d+)"), "Era"),
        (re.compile(r"Location(?:|_\d+)"), "Location"),
        (re.compile(r"Geography(?:|_\d+)"), "Geography"),
    ]
    foreign_keys = {}
    for column_name in column_names:
        for regex, mapped in column_map:
            # search() is unanchored and there is no break, so when several
            # patterns match the same column the last entry in column_map wins.
            if regex.search(column_name):
                foreign_keys[column_name] = (mapped, "value")
    return foreign_keys


def convert_csv_to_sqlite(csv_filename):
    """Convert a single CSV file into a SQLite database next to it.

    The database path is ``csv_filename`` with its extension replaced by
    ``.sqlite``. The CSV is loaded as one dataframe with table name
    ``Items``; columns matching the known lookup patterns are passed to
    ``refactor_dataframes`` as foreign keys. Each resulting dataframe is
    written with ``to_sql_with_foreign_keys``, after dropping any existing
    table of the same name.

    Args:
        csv_filename: Path of the CSV file to convert.

    Raises:
        AssertionError: If the CSV could not be loaded (the load error itself
            is reported on stderr), so there is not exactly one dataframe.
    """
    sqlite_filename = "{}.sqlite".format(os.path.splitext(csv_filename)[0])
    conn = sqlite3.connect(sqlite_filename)
    # Close the connection on every exit path, not just on success.
    try:
        dataframes = []
        try:
            df = load_csv(
                csv_filename,
                separator=",",
                skip_errors=False,
                quoting=0,
                shape=None,
            )
        except LoadCsvError as e:
            print(
                "Could not load {}: {}".format(csv_filename, e),
                file=sys.stderr,
            )
        else:
            df.table_name = "Items"
            dataframes.append(df)
        print("Loaded {} dataframes".format(len(dataframes)))

        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if len(dataframes) != 1:
            raise AssertionError(
                "Expected exactly one loaded dataframe, got {}".format(
                    len(dataframes)
                )
            )

        foreign_keys = _foreign_keys_for_columns(dataframes[0].columns)

        # Now we have loaded the dataframes, we can refactor them
        refactored = refactor_dataframes(conn, dataframes, foreign_keys, False)
        for df in refactored:
            # This is a bit trickier because we need to
            # create the table with extra SQL for foreign keys
            if table_exists(conn, df.table_name):
                drop_table(conn, df.table_name)
            to_sql_with_foreign_keys(conn, df, df.table_name, foreign_keys)
    finally:
        conn.close()
    print("Updated", sqlite_filename)
def test_table_exists(table, expected):
    """Check that ``utils.table_exists`` returns ``expected`` for ``table``.

    The lookup runs against a fresh in-memory SQLite database populated by
    executing the ``TEST_TABLES`` script.
    """
    connection = sqlite3.connect(':memory:')
    connection.executescript(TEST_TABLES)
    actual = utils.table_exists(connection, table)
    assert expected == actual
import json import os import sqlite3 import sys import requests from csvs_to_sqlite.utils import table_exists from Levenshtein import ratio from utils import column_exists, normalize_place_name GEONAMES_API_USERNAME = os.environ.get("GEONAMES_API_USERNAME", "") db = sqlite3.connect(sys.argv[1], isolation_level=None) if not table_exists(db, "geonames_queries"): db.execute(""" CREATE TABLE geonames_queries ( id INTEGER PRIMARY KEY, value VARCHAR(255), fetched TIMESTAMP NULL, result_count INTEGER, geonames JSON NULL ) """) for table_name in ("Location", "Geography", "Subject"): if not column_exists(db, table_name, "latitude"): db.execute( f"ALTER TABLE {table_name} ADD COLUMN latitude VARCHAR(16) NULL") db.execute(