def export_loc_books():
    "export LOC books"
    query = '''
        SELECT DISTINCT cluster AS item
        FROM locmds.book_rec_isbn
        JOIN isbn_cluster USING (isbn_id)
    '''
    _log.info('reading LOC books')
    books = pd.read_sql(query, db_uri())

    csv_fn = data_dir / 'loc-books.csv.gz'
    pq_fn = data_dir / 'loc-books.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='snappy')
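
# export_loc_books and the exporters below all write the same frame twice, as
# gzipped CSV and as Parquet. A minimal round-trip sanity check for that
# pattern (hypothetical helper, not part of the pipeline): re-read both files
# and confirm they agree after normalizing the dtypes CSV loses.
def _check_export_pair(csv_fn, pq_fn):
    "verify that a CSV/Parquet export pair round-trips to the same frame"
    from_csv = pd.read_csv(csv_fn)
    from_pq = pd.read_parquet(pq_fn)
    # to_csv stringifies values, so cast the CSV frame to the Parquet dtypes
    pd.testing.assert_frame_equal(from_csv.astype(from_pq.dtypes.to_dict()), from_pq)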
def export_authors():
    "export first-author genders for book clusters"
    query = '''
        SELECT cluster AS item, gender
        FROM cluster_first_author_gender
        ORDER BY cluster
    '''
    _log.info('reading author genders')
    gender = pd.read_sql(query, db_uri())

    csv_fn = data_dir / 'author-gender.csv.gz'
    pq_fn = data_dir / 'author-gender.parquet'
    _log.info('writing CSV to %s', csv_fn)
    gender.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    gender.to_parquet(pq_fn, index=False, compression='snappy')
def export_book_ids():
    "export GoodReads book IDs with their cluster assignments"
    query = '''
        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
        FROM gr.book_ids
        JOIN gr.book_cluster USING (gr_book_id)
        ORDER BY gr_book_rid
    '''
    _log.info('reading book IDs')
    books = pd.read_sql(query, db_uri())

    csv_fn = data_dir / 'gr-book-ids.csv.gz'
    pq_fn = data_dir / 'gr-book-ids.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='gzip')
def export_book_hashes():
    "export ISBN-list hashes for book clusters"
    query = '''
        SELECT cluster AS item, COUNT(isbn) AS nisbns,
               MD5(STRING_AGG(isbn, '|' ORDER BY isbn)) AS md5
        FROM isbn_cluster
        JOIN isbn_id USING (isbn_id)
        GROUP BY cluster
    '''
    _log.info('reading book ID hashes')
    hashes = pd.read_sql(query, db_uri())

    csv_fn = data_dir / 'book-hash.csv.gz'
    pq_fn = data_dir / 'book-hash.parquet'
    _log.info('writing CSV to %s', csv_fn)
    hashes.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    hashes.to_parquet(pq_fn, index=False, compression='snappy')
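
# The MD5 over pipe-joined, sorted ISBNs above acts as a fingerprint of each
# cluster's ISBN set, so cluster assignments can be compared across runs
# without shipping the full lists. A sketch of the same fingerprint computed
# client-side, assuming a plain list of ISBN strings for one cluster (ISBNs
# are digits plus 'X', so Python's sort matches the SQL ordering):
import hashlib

def _cluster_fingerprint(isbns):
    "reproduce MD5(STRING_AGG(isbn, '|' ORDER BY isbn)) for one cluster"
    joined = '|'.join(sorted(isbns))
    return hashlib.md5(joined.encode('utf-8')).hexdigest()

# e.g. _cluster_fingerprint(['9780261102217', '9780261102361'])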
def sample(options):
    "sample user profiles with gender-resolved books for evaluation"
    data = dt.fname(options.data)
    seed = init_rng(rng_seed(), 'sample-users', data)
    _log.info('using random seed %s', seed)
    ds = dt.datasets[data]
    # only count ratings of books whose first author resolved to a gender label
    kr_query = f'''
        SELECT r.user_id AS user, COUNT(book_id) AS profile_size
        FROM {ds.table} r
        JOIN cluster_first_author_gender g ON g.cluster = r.book_id
        WHERE gender = 'male' OR gender = 'female'
        GROUP BY r.user_id
        HAVING COUNT(book_id) >= {options.min_ratings}
    '''
    _log.info('loading users for %s', data)
    valid_users = pd.read_sql(kr_query, db_uri())
    _log.info('found %d viable profiles, sampling %d',
              len(valid_users), options.sample_size)
    sampled = valid_users.sample(options.sample_size, random_state=rng(legacy=True))

    ddir = data_dir / data
    u_fn = ddir / 'sample-users.csv'
    _log.info('writing %s', u_fn)
    sampled.to_csv(u_fn, index=False)

    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    ratings = pd.merge(sampled[['user']], ratings)
    r_fn = ddir / 'sample-ratings.csv'
    _log.info('writing %d ratings to %s', len(ratings), r_fn)
    ratings.to_csv(r_fn, index=False)

    s_fn = ddir / 'sample-stats.json'
    _log.info('writing stats to %s', s_fn)
    s_fn.write_text(json.dumps({
        'viable': len(valid_users),
        'sampled': options.sample_size,
        'ratings': len(ratings)
    }))
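
# sample() relies on a seeded RNG (via init_rng/rng) so the drawn user sample
# is reproducible across runs. A standalone sketch of that idea with plain
# pandas and NumPy (the names below are illustrative, not the project's API):
import numpy as np

def _reproducible_sample(frame, n, seed=42):
    "draw the same n rows on every run for a fixed seed"
    rs = np.random.RandomState(seed)  # legacy RandomState, like rng(legacy=True)
    return frame.sample(n, random_state=rs)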