Example #1
def export_loc_books():
    "export LOC books"
    query = '''
        SELECT DISTINCT cluster AS item
        FROM locmds.book_rec_isbn JOIN isbn_cluster USING (isbn_id)
    '''
    _log.info('reading LOC books')
    books = pd.read_sql(query, db_uri())
    csv_fn = data_dir / 'loc-books.csv.gz'
    pq_fn = data_dir / 'loc-books.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='snappy')
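All of these snippets lean on module-level scaffolding from the surrounding project: pandas, a logger, an output directory, and a db_uri() helper (the later examples also use json plus a dataset registry dt and RNG helpers, which stay project-specific and are not sketched here). A minimal sketch of the shared pieces, with assumed bodies; pd.read_sql accepts an SQLAlchemy-style URI string like the one sketched below:

import json
import logging
import os
from pathlib import Path

import pandas as pd

_log = logging.getLogger(__name__)
data_dir = Path('data')  # assumed output directory


def db_uri():
    # Assumed helper: return an SQLAlchemy-compatible database URI, e.g. a
    # postgresql://user@host/db string taken from the environment.
    return os.environ['DB_URL']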
Example #2
def export_authors():
    "export author genders"
    query = '''
        SELECT cluster AS item, gender
        FROM cluster_first_author_gender
        ORDER BY cluster
    '''
    _log.info('reading author genders')
    gender = pd.read_sql(query, db_uri())
    csv_fn = data_dir / 'author-gender.csv.gz'
    pq_fn = data_dir / 'author-gender.parquet'
    _log.info('writing CSV to %s', csv_fn)
    gender.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    gender.to_parquet(pq_fn, index=False, compression='snappy')
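Because the export keys author gender by cluster (aliased to item), downstream code can attach it to anything carrying an item column. A hedged sketch of that join; the second file name and its item column are hypothetical stand-ins:

import pandas as pd

gender = pd.read_parquet('data/author-gender.parquet')
items = pd.read_parquet('data/items.parquet')  # hypothetical frame with an 'item' column
# Left join keeps items whose first-author gender never resolved.
items = items.merge(gender, on='item', how='left')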
Example #3
def export_book_ids():
    "export GoodReads book IDs"
    query = '''
        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
        FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
        ORDER BY gr_book_rid
    '''
    _log.info('reading book IDs')
    books = pd.read_sql(query, db_uri())
    csv_fn = data_dir / 'gr-book-ids.csv.gz'
    pq_fn = data_dir / 'gr-book-ids.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='gzip')
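Note that this export writes its Parquet file with gzip compression, while the other examples use snappy. Readers do not need to care: the codec is recorded in the Parquet file's metadata, so reading back works the same either way. A quick check, assuming the file written above exists:

import pandas as pd

# Compression is stored in the Parquet footer; no argument needed on read.
books = pd.read_parquet('data/gr-book-ids.parquet')
print(len(books), 'rows')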
Example #4
def export_book_hashes():
    "export cluster ISBN hashes"
    query = '''
        SELECT cluster AS item, COUNT(isbn) AS nisbns,
            MD5(STRING_AGG(isbn, '|' ORDER BY isbn)) AS md5
        FROM isbn_cluster JOIN isbn_id USING (isbn_id)
        GROUP BY cluster
    '''
    _log.info('reading book ID hashes')
    hashes = pd.read_sql(query, db_uri())
    csv_fn = data_dir / 'book-hash.csv.gz'
    pq_fn = data_dir / 'book-hash.parquet'
    _log.info('writing CSV to %s', csv_fn)
    hashes.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    hashes.to_parquet(pq_fn, index=False, compression='snappy')
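The MD5 of the pipe-joined, sorted ISBN list acts as a per-cluster fingerprint: if a cluster's membership changes between runs, its hash changes. The same fingerprint can be recomputed client-side; a sketch with a toy frame standing in for the isbn_cluster/isbn_id join (assuming plain codepoint ordering matches the database collation for ISBN strings):

import hashlib

import pandas as pd

def cluster_hash(isbns):
    # Mirror the SQL: ORDER BY isbn, STRING_AGG(..., '|'), then MD5.
    return hashlib.md5('|'.join(sorted(isbns)).encode('utf-8')).hexdigest()

df = pd.DataFrame({
    'cluster': [1, 1, 2],
    'isbn': ['0306406152', '9780306406157', '0140449132'],
})
print(df.groupby('cluster')['isbn'].agg(cluster_hash))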
Example #5
def sample(options):
    "sample users with viable profiles"
    data = dt.fname(options.data)

    seed = init_rng(rng_seed(), 'sample-users', data)
    _log.info('using random seed %s', seed)

    ds = dt.datasets[data]

    # Profiles are viable when the user has at least options.min_ratings
    # ratings of books whose first author resolved to male or female.
    kr_query = f'''
        SELECT r.user_id AS user, COUNT(book_id) AS profile_size
        FROM {ds.table} r
        JOIN cluster_first_author_gender g ON g.cluster = r.book_id
        WHERE gender = 'male' OR gender = 'female'
        GROUP BY r.user_id
        HAVING COUNT(book_id) >= {options.min_ratings}
    '''

    _log.info('loading users for %s', data)
    valid_users = pd.read_sql(kr_query, db_uri())
    _log.info('found %d viable profiles, sampling %d',
              len(valid_users), options.sample_size)
    sample = valid_users.sample(options.sample_size, random_state=rng(legacy=True))

    ddir = data_dir / data

    u_fn = ddir / 'sample-users.csv'
    _log.info('writing %s', u_fn)
    sample.to_csv(u_fn, index=False)

    ratings = pd.read_parquet(ddir / 'ratings.parquet')
    # Keep only the ratings that belong to the sampled users.
    ratings = pd.merge(sample[['user']], ratings, on='user')
    r_fn = ddir / 'sample-ratings.csv'
    _log.info('writing %d ratings to %s', len(ratings), r_fn)
    ratings.to_csv(r_fn, index=False)

    s_fn = ddir / 'sample-stats.json'
    _log.info('writing stats to %s', s_fn)
    s_fn.write_text(json.dumps({
        'viable': len(valid_users),
        'sampled': options.sample_size,
        'ratings': len(ratings)
    }))
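sample() only reads three attributes from its options argument: data, min_ratings, and sample_size. A minimal invocation sketch; the dataset key 'az' is a hypothetical stand-in, the real tool presumably builds options from a CLI parser, and dt, init_rng, rng_seed, and rng come from the surrounding project:

from types import SimpleNamespace

# Hypothetical options object; attribute names match what sample() reads.
options = SimpleNamespace(data='az', min_ratings=5, sample_size=1000)
sample(options)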