def export_work_ratings():
    query = '''
        SELECT gr_user_rid AS user, gr_work_id AS item,
               MEDIAN(rating) AS rating,
               (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
               MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
               COUNT(rating) AS nratings
        FROM gr.interaction
        JOIN gr.book_ids USING (gr_book_id)
        WHERE rating > 0 AND gr_work_id IS NOT NULL
        GROUP BY gr_user_rid, gr_work_id
        ORDER BY MIN(date_updated)
    '''
    with db.connect() as dbc:
        _log.info('reading book ratings')
        ratings = db.load_table(dbc, query, dtype={
            'user': 'i4', 'item': 'i4',
            'rating': 'f4', 'last_rating': 'f4',
            'nratings': 'i4'
        })

    _log.info('writing ratings')
    ratings.to_parquet('gr-work-ratings.parquet', index=False,
                       compression='zstd', compression_level=5)
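
# Illustrative sketch, not part of the export itself: the parquet file written
# above can be read back with pandas for downstream use.  The helper name is
# hypothetical; the column names come from the SELECT aliases in the query.
def _demo_read_work_ratings(path='gr-work-ratings.parquet'):
    import pandas as pd

    ratings = pd.read_parquet(path)
    # one row per (user, item) pair: median rating, most recent rating,
    # median timestamp, and the number of ratings aggregated
    return ratings[['user', 'item', 'rating', 'last_rating', 'timestamp', 'nratings']]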
def cluster(txout): "Cluster ISBNs" with db.connect() as dbc, dbc: tracking.begin_stage(dbc, 'cluster') with db.engine().connect() as cxn: _log.info('loading graph') gl = GraphLoader() g = gl.load_graph(cxn, False) print('NODES', g.num_vertices(), file=txout) print('EDGES', g.num_edges(), file=txout) _log.info('finding connected components') comps, hist = label_components(g) _log.info('found %d components, largest has %s items', len(hist), np.max(hist)) print('COMPONENTS', len(hist), file=txout) _log.info('saving cluster records to database') is_isbn = g.vp.source.a == ns_isbn.code clusters = pd.DataFrame({ 'isbn_id': g.vp.label.a[is_isbn], 'cluster': comps.a[is_isbn] }) _import_clusters(dbc, clusters) _log.info('saving ID graph') g.vp['cluster'] = comps g.save('data/id-graph.gt') c_hash = _hash_frame(clusters) print('WRITE CLUSTERS', c_hash, file=txout) tracking.end_stage(dbc, 'cluster', c_hash)
def export_work_actions():
    query = '''
        SELECT gr_user_rid AS user, gr_work_id AS item,
               COUNT(rating) AS nactions,
               MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
               MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
        FROM gr.interaction
        JOIN gr.book_ids USING (gr_book_id)
        WHERE gr_work_id IS NOT NULL
        GROUP BY gr_user_rid, gr_work_id
        ORDER BY MIN(date_updated)
    '''
    with db.connect() as dbc:
        _log.info('reading book shelf actions')
        actions = db.load_table(dbc, query, dtype={
            'user': 'i4', 'item': 'i4', 'nactions': 'i4'
        })

    _log.info('writing actions')
    actions.to_parquet('gr-work-actions.parquet', index=False,
                       compression='zstd', compression_level=5)
def export_work_titles():
    query = '''
        SELECT gr_work_id AS work_id, gr_work_rid, work_title
        FROM gr.work_title
        ORDER BY gr_work_rid
    '''
    with db.connect() as dbc:
        _log.info('reading work titles')
        books = db.load_table(dbc, query)

    pq_fn = 'gr-work-titles.parquet'
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False)

    _log.info('writing CSV')
    books.to_csv('gr-work-titles.csv.gz', index=False)
def export_work_genres():
    query = '''
        SELECT gr_work_id AS work_id, genre, SUM(score::int) AS score
        FROM gr.book_ids
        JOIN gr.book_genres USING (gr_book_rid)
        GROUP BY work_id, genre
        ORDER BY work_id, genre
    '''
    with db.connect() as dbc:
        _log.info('reading work genres')
        genres = db.load_table(dbc, query)

    pq_fn = 'gr-work-genres.parquet'
    _log.info('writing parquet to %s', pq_fn)
    genres.to_parquet(pq_fn, index=False, compression='brotli')

    _log.info('writing CSV')
    genres.to_csv('gr-work-genres.csv.gz', index=False)
def export_book_ids():
    query = '''
        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
        FROM gr.book_ids
        JOIN gr.book_cluster USING (gr_book_id)
        ORDER BY gr_book_rid
    '''
    with db.connect() as dbc:
        _log.info('reading book IDs')
        books = db.load_table(dbc, query)

    csv_fn = 'gr-book-ids.csv.gz'
    pq_fn = 'gr-book-ids.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False)
def export_work_authors():
    query = '''
        WITH pairs AS (
            SELECT DISTINCT gr_work_id AS work_id, gr_author_id
            FROM gr.book_ids
            JOIN gr.book_authors USING (gr_book_rid)
            WHERE author_role = '' AND gr_work_id IS NOT NULL
        )
        SELECT work_id, gr_author_id AS author_id, author_name
        FROM pairs
        JOIN gr.author_info USING (gr_author_id)
        ORDER BY work_id
    '''
    with db.connect() as dbc:
        _log.info('reading work authors')
        books = db.load_table(dbc, query)

    pq_fn = 'gr-work-authors.parquet'
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='brotli')

    _log.info('writing CSV')
    books.to_csv('gr-work-authors.csv.gz', index=False)
def __call__(self, ids):
    with db.connect() as dbc, dbc.cursor() as cur:
        if ids:
            return [self._id_rec(cur, r) for r in ids]
        else:
            return [self._top_rec(cur)]
if tfile:
    tfile = Path(tfile)
else:
    tfile = script_file.with_suffix('.transcript')

stage = opts.get('-s', None)
if not stage:
    stage = script_file.stem

_log.info('reading %s', script_file)
script = db.SqlScript(script_file)
_log.info('%s has %d chunks', script_file, len(script.chunks))
if opts.get('--dry-run'):
    script.describe()
else:
    with tfile.open('w') as txf, db.connect() as dbc:
        key = hashlib.md5()
        with dbc, dbc.cursor() as cur:
            tracking.begin_stage(cur, stage)
            for dep in script.deps:
                dhs = tracking.record_dep(cur, stage, dep)
                # hash the dependency hashes
                for d, h in dhs:
                    key.update(h.encode('utf-8'))
            h = tracking.hash_and_record_file(cur, script_file, stage)  # hash the source file
            key.update(h.encode('utf-8'))

        script.execute(dbc, transcript=txf)

        with dbc, dbc.cursor() as cur:
            for ns, tbl in script.tables:
                oid, kind = tracking.record_tbl(cur, stage, ns, tbl)
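
# Illustrative sketch, hypothetical helper: the stage key above is an MD5 digest
# accumulated over each recorded dependency hash plus the hash of the script
# file itself, so the key changes whenever any input to the stage changes.
def _demo_stage_key(dep_hashes, script_hash):
    import hashlib

    key = hashlib.md5()
    for h in dep_hashes:                     # hashes recorded for each dependency
        key.update(h.encode('utf-8'))
    key.update(script_hash.encode('utf-8'))  # hash of the SQL script file
    return key.hexdigest()                   # deterministic 32-character hex digest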
in_chk = hashlib.sha1(data).hexdigest()

barr = np.frombuffer(data, dtype='u1')
# delete bytes that are too big
barr = barr[barr < 128]
# convert to LF
barr = barr[barr != ord('\r')]
# change delimiter to comma
barr[barr == ord(';')] = ord(',')

# write
_log.info('importing BX to database')
data = bytes(barr)
rd = StringIO(data.decode('utf8'))
with db.connect() as dbc:
    print('IMPORT TO bx.raw_ratings', file=tx_file)
    print('READ', src_file, in_chk, file=tx_file)

    # we're going to hash the data we insert
    dh = hashlib.md5()

    # with dbc encapsulates a transaction
    with dbc, dbc.cursor() as cur:
        tracking.begin_stage(cur, 'bx-ratings')
        tracking.record_file(cur, src_file, in_chk, 'bx-ratings')
        tracking.record_dep(cur, 'bx-ratings', 'bx-schema')
        n = 0
        for row in tqdm(csv.DictReader(rd)):
            uid = row['User-ID']
            isbn = row['ISBN']
            rating = row['Book-Rating']
            cur.execute(