def _top_rec(self, cur): t = self.type _log.info('fetching one %s', t) q = f'SELECT {t}_data FROM ol.{t} LIMIT 1' cur.execute(q) data, = cur.fetchone() _log.debug('got %r', data) return data __gr_work = GR('work') __gr_book = GR('book') __ol_edition = OL('edition') __ol_work = OL('work') _log = script_log(__name__) opts = docopt(__doc__) rec_ids = opts.get('ID', None) recs = None for k in opts.keys(): fn = k.replace('-', '_') if k.startswith('--') and opts[k] and fn in globals(): f = globals()[fn] recs = f(rec_ids) if recs is None: _log.error('could not find an operation to perform') for rec in recs:
<zip> The zip file to read. <output> The output file to write. """ from bookdata import script_log from docopt import docopt import numpy as np import pandas as pd from io import BytesIO from zipfile import ZipFile opts = docopt(__doc__) _log = script_log(__name__, debug=opts['--verbose']) _log.info("extracting BX rating data") with ZipFile(opts['<zip>'], 'r') as zf: with zf.open('BX-Book-Ratings.csv') as f: data = f.read() _log.info("cleaning BX rating data") barr = np.frombuffer(data, dtype='u1') # delete bytes that are too big barr = barr[barr < 128] # convert to LF barr = barr[barr != ord('\r')] # change delimiter to comma barr[barr == ord(';')] = ord(',') data = bytes(barr)
export.py --work-authors export.py --work-genres export.py --work-ratings export.py --work-actions """ from pathlib import Path from docopt import docopt import pandas as pd import pyarrow as pa import pyarrow.parquet as pq from bookdata import script_log from bookdata import db _log = script_log('export-goodreads') def export_book_ids(): query = ''' SELECT gr_book_rid, gr_book_id, gr_work_id, cluster FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id) ORDER BY gr_book_rid ''' with db.connect() as dbc: _log.info('reading book IDs') books = db.load_table(dbc, query) csv_fn = 'gr-book-ids.csv.gz' pq_fn = 'gr-book-ids.parquet'
import time
import hashlib
from pathlib import Path
from datetime import timedelta
from typing import NamedTuple, List
from docopt import docopt
import psycopg2, psycopg2.extensions, psycopg2.extras
from more_itertools import peekable
import sqlparse
from bookdata import script_log
from bookdata import db, tracking

opts = docopt(__doc__)
_log = script_log(__name__, opts.get('--verbose'))
# Register psycopg2.extras.wait_select as psycopg2's global wait callback.
psycopg2.extensions.set_wait_callback(psycopg2.extras.wait_select)

# Path of the script named by the SCRIPT argument.
script_file = Path(opts.get('SCRIPT'))

# -T wins when given; otherwise use the script path with a '.transcript'
# suffix.
tfile = opts.get('-T')
tfile = Path(tfile) if tfile else script_file.with_suffix('.transcript')

# -s wins when given (and non-empty); otherwise fall back to the script's
# stem.
stage = opts.get('-s') or script_file.stem
""" Output configuration information. Usage: config.py --database (--url | --env) """ from docopt import docopt from bookdata import db from bookdata import script_log _log = script_log(__file__) def _print_env(src, attr, var): val = getattr(src, attr, None) if val is not None: print(f"export {var}='{val}'") def db_config(opts): cfg = db.DBConfig.load() if opts['--url']: print(cfg.url()) elif opts['--env']: _print_env(cfg, 'host', 'PGHOST') _print_env(cfg, 'port', 'PGPORT') _print_env(cfg, 'database', 'PGDATABASE') _print_env(cfg, 'user', 'PGUSER') _print_env(cfg, 'password', 'PGPASSWORD')