def summary(cur):
    """Print a one-line progress report for the ``swapi`` table.

    Four counts are fetched, in order, through ``myutils.queryValue``:

    * ``Total`` - every row in ``swapi``
    * ``todo``  - rows whose ``status`` is still NULL
    * ``good``  - rows whose ``status`` equals 200
    * ``error`` - rows whose ``status`` differs from 200 (in SQL a NULL
      status does not satisfy ``!=``, so pending rows are not counted here)

    Args:
        cur: Database cursor handed unchanged to ``myutils.queryValue``.
    """
    # (label, query) pairs; the order fixes both the query sequence and the
    # left-to-right layout of the printed report.
    counters = (
        ('Total', 'SELECT COUNT(*) FROM swapi;'),
        ('todo', 'SELECT COUNT(*) FROM swapi WHERE status IS NULL;'),
        ('good', 'SELECT COUNT(*) FROM swapi WHERE status = 200;'),
        ('error', 'SELECT COUNT(*) FROM swapi WHERE status != 200;'),
    )
    report = ' '.join(
        f'{label}={myutils.queryValue(cur, query)}'
        for label, query in counters
    )
    print(report)
port=secrets['port'], # database=secrets['database'], user=secrets['user'], password=secrets['pass'], connect_timeout=3) # https://stackoverflow.com/questions/34484066/create-a-postgres-database-using-python conn.autocommit = True cur = conn.cursor() sql = "select datname,oid,(pg_stat_file('base/'||oid ||'/PG_VERSION')).modification from pg_database where datname='pg4e_025ca';" sql = "SELECT datname FROM pg_database;" # row = cur.execute(sql, fields) sql = "SELECT setting FROM pg_settings WHERE name = 'data_directory';" data_directory = myutils.queryValue(cur, sql) print('Data directory', data_directory) # https://stackoverflow.com/questions/24806122/get-database-creation-date-on-postgresql sql = "SELECT datname,oid,(pg_stat_file('base/'||oid ||'/PG_VERSION')).modification FROM pg_database ORDER BY oid;" stmt = cur.execute(sql) expired = list() conn2 = False cur2 = False keep = 0 while True: if len(expired) > limit: break row = cur.fetchone() if not row: break db_name = row[0]
print('If you want to restart the spider, run') print('DROP TABLE IF EXISTS swapi CASCADE;') print(' ') sql = ''' CREATE TABLE IF NOT EXISTS swapi (id serial, url VARCHAR(2048) UNIQUE, status INTEGER, body JSONB, created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), updated_at TIMESTAMPTZ); ''' print(sql) cur.execute(sql) # Check to see if we have urls in the table, if not add starting points # for each of the object trees sql = 'SELECT COUNT(url) FROM swapi;' count = myutils.queryValue(cur, sql) if count < 1: objects = ['films', 'species', 'people'] for obj in objects: sql = f"INSERT INTO swapi (url) VALUES ( 'https://swapi.py4e.com/api/{obj}/1/' )" print(sql) cur.execute(sql, (defaulturl)) conn.commit() many = 0 count = 0 chars = 0 fail = 0 summary(cur) while True: if (many < 1):
port=secrets['port'], connect_timeout=5, database=secrets['database'], user=secrets['user'], password=secrets['pass']) cur = conn.cursor() baseurl = 'http://mbox.dr-chuck.net/sakai.devel/' cur.execute('''CREATE TABLE IF NOT EXISTS messages (id SERIAL, email TEXT, sent_at TIMESTAMPTZ, subject TEXT, headers TEXT, body TEXT)''') # Pick up where we left off sql = 'SELECT max(id) FROM messages' start = myutils.queryValue(cur, sql) if start is None: start = 0 many = 0 count = 0 fail = 0 while True: if (many < 1): conn.commit() sval = input('How many messages:') if (len(sval) < 1): break many = int(sval) start = start + 1 # Skip rows that are already retrieved