Beispiel #1
0
    def _top_rec(self, cur):
        """Fetch the data column of one row from this object's ``ol`` table.

        The query selects ``<type>_data`` from ``ol.<type>``, where ``<type>``
        is ``self.type``, limited to a single row.

        Args:
            cur: a database cursor providing ``execute`` and ``fetchone``.

        Returns:
            The single selected column value of the fetched row.

        Note:
            If ``fetchone()`` yields no row (``None``), the tuple unpacking
            below fails with a ``TypeError``.
        """
        kind = self.type
        _log.info('fetching one %s', kind)
        cur.execute(f'SELECT {kind}_data FROM ol.{kind} LIMIT 1')
        # The result row has exactly one column; unpack it strictly.
        (record,) = cur.fetchone()
        _log.debug('got %r', record)
        return record


# Module-level handlers.  The dispatch loop below finds them in globals() by
# turning an option name into an identifier, e.g. '--gr-work' -> '__gr_work'.
__gr_work = GR('work')
__gr_book = GR('book')
__ol_edition = OL('edition')
__ol_work = OL('work')

_log = script_log(__name__)
opts = docopt(__doc__)

rec_ids = opts.get('ID', None)

# Call the handler of every enabled '--' option that has a matching global;
# if several are enabled, the result of the last one wins.
recs = None
for k in opts:
    fn = k.replace('-', '_')
    if k.startswith('--') and opts[k] and fn in globals():
        f = globals()[fn]
        recs = f(rec_ids)

if recs is None:
    _log.error('could not find an operation to perform')
    # Stop here: the code that follows iterates over recs, and iterating
    # over None would only produce an unrelated TypeError traceback.
    raise SystemExit(1)

for rec in recs:
    <zip>
        The zip file to read.
    <output>
        The output file to write.
"""

from bookdata import script_log
from docopt import docopt

import numpy as np
import pandas as pd
from io import BytesIO
from zipfile import ZipFile

opts = docopt(__doc__)
_log = script_log(__name__, debug=opts['--verbose'])

# Read the whole ratings CSV out of the archive into memory as bytes.
_log.info("extracting BX rating data")
with ZipFile(opts['<zip>'], 'r') as zf, zf.open('BX-Book-Ratings.csv') as f:
    data = f.read()

_log.info("cleaning BX rating data")
barr = np.frombuffer(data, dtype='u1')
# Keep only 7-bit bytes and drop carriage returns (leaving LF line endings).
# Boolean indexing returns a fresh array, so it can be modified in place.
barr = barr[(barr < 128) & (barr != ord('\r'))]
# Switch the field delimiter from semicolon to comma.
barr[barr == ord(';')] = ord(',')
data = barr.tobytes()
    export.py --work-authors
    export.py --work-genres
    export.py --work-ratings
    export.py --work-actions
"""

from pathlib import Path
from docopt import docopt
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

from bookdata import script_log
from bookdata import db

# Script-wide logger, obtained from bookdata's script_log helper under the
# fixed name 'export-goodreads'.
_log = script_log('export-goodreads')


def export_book_ids():
    query = '''
        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
        FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
        ORDER BY gr_book_rid
    '''

    with db.connect() as dbc:
        _log.info('reading book IDs')
        books = db.load_table(dbc, query)

    csv_fn = 'gr-book-ids.csv.gz'
    pq_fn = 'gr-book-ids.parquet'
Beispiel #4
0
import time
import hashlib
from pathlib import Path
from datetime import timedelta
from typing import NamedTuple, List
from docopt import docopt

import psycopg2, psycopg2.extensions, psycopg2.extras
from more_itertools import peekable
import sqlparse

from bookdata import script_log
from bookdata import db, tracking

opts = docopt(__doc__)
_log = script_log(__name__, opts.get('--verbose'))

# Install psycopg2's select()-based wait callback.
psycopg2.extensions.set_wait_callback(psycopg2.extras.wait_select)

script_file = Path(opts.get('SCRIPT'))

# Transcript path: the -T option if given, otherwise the script path with a
# '.transcript' suffix.
tfile = opts.get('-T', None)
tfile = Path(tfile) if tfile else script_file.with_suffix('.transcript')

# Stage name: the -s option if given, otherwise the script's base name.
stage = opts.get('-s', None) or script_file.stem
Beispiel #5
0
"""
Output configuration information.

Usage:
    config.py --database (--url | --env)
"""

from docopt import docopt

from bookdata import db
from bookdata import script_log

# Script-wide logger from bookdata's script_log helper; this file's path is
# passed as the name argument.
_log = script_log(__file__)


def _print_env(src, attr, var):
    val = getattr(src, attr, None)
    if val is not None:
        print(f"export {var}='{val}'")


def db_config(opts):
    """Print the database configuration in the format selected by *opts*.

    The configuration is loaded with ``db.DBConfig.load()``.

    Args:
        opts: option mapping indexed by ``'--url'`` and ``'--env'``.

    With ``--url`` set, the result of ``cfg.url()`` is printed.  Otherwise,
    with ``--env`` set, one ``export`` line is printed for each of the host,
    port, database, user and password attributes, skipping any that are
    missing or ``None`` (see ``_print_env``).
    """
    cfg = db.DBConfig.load()
    if opts['--url']:
        print(cfg.url())
    elif opts['--env']:
        # The variable names are the PG* names passed through to the shell.
        _print_env(cfg, 'host', 'PGHOST')
        _print_env(cfg, 'port', 'PGPORT')
        _print_env(cfg, 'database', 'PGDATABASE')
        _print_env(cfg, 'user', 'PGUSER')
        _print_env(cfg, 'password', 'PGPASSWORD')