Ejemplo n.º 1
0
def export_work_ratings():
    query = '''
    SELECT gr_user_rid AS user, gr_work_id AS item,
            MEDIAN(rating) AS rating,
            (array_agg(rating ORDER BY date_updated DESC))[1] AS last_rating,
            MEDIAN(EXTRACT(EPOCH FROM date_updated)) AS timestamp,
            COUNT(rating) AS nratings
     FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
     WHERE rating > 0 AND gr_work_id IS NOT NULL
     GROUP BY gr_user_rid, gr_work_id
     ORDER BY MIN(date_updated)
    '''

    with db.connect() as dbc:
        _log.info('reading book ratings')
        ratings = db.load_table(dbc,
                                query,
                                dtype={
                                    'user': '******',
                                    'item': 'i4',
                                    'rating': 'f4',
                                    'last_rating': 'f4',
                                    'nratings': 'i4'
                                })

    _log.info('writing ratings')
    ratings.to_parquet('gr-work-ratings.parquet',
                       index=False,
                       compression='zstd',
                       compression_level=5)
Ejemplo n.º 2
0
def export_work_actions():
    query = '''
    SELECT gr_user_rid AS user, gr_work_id AS item,
            COUNT(rating) AS nactions,
            MIN(EXTRACT(EPOCH FROM date_updated)) AS first_time,
            MAX(EXTRACT(EPOCH FROM date_updated)) AS last_time
     FROM gr.interaction JOIN gr.book_ids USING (gr_book_id)
     WHERE gr_work_id IS NOT NULL
     GROUP BY gr_user_rid, gr_work_id
     ORDER BY MIN(date_updated)
    '''

    with db.connect() as dbc:
        _log.info('reading book shelf actions')
        actions = db.load_table(dbc,
                                query,
                                dtype={
                                    'user': '******',
                                    'item': 'i4',
                                    'nactions': 'i4'
                                })

    _log.info('writing actions')
    actions.to_parquet('gr-work-actions.parquet',
                       index=False,
                       compression='zstd',
                       compression_level=5)
Ejemplo n.º 3
0
def export_work_titles():
    query = f'''
        SELECT gr_work_id AS work_id, gr_work_rid, work_title
        FROM gr.work_title
        ORDER BY gr_work_rid
    '''

    with db.connect() as dbc:
        _log.info('reading work titles')
        books = db.load_table(dbc, query)

    pq_fn = 'gr-work-titles.parquet'
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False)
    _log.info('writing CSV')
    books.to_csv('gr-work-titles.csv.gz', index=False)
Ejemplo n.º 4
0
def export_work_genres():
    query = f'''
        SELECT gr_work_id AS work_id, genre, sum(score::int) AS score
        FROM gr.book_ids JOIN gr.book_genres USING (gr_book_rid)
        GROUP BY work_id, genre
        ORDER BY work_id, genre
    '''

    with db.connect() as dbc:
        _log.info('reading work genres')
        genres = db.load_table(dbc, query)

    pq_fn = 'gr-work-genres.parquet'
    _log.info('writing parquet to %s', pq_fn)
    genres.to_parquet(pq_fn, index=False, compression='brotli')
    _log.info('writing CSV')
    genres.to_csv('gr-work-genres.csv.gz', index=False)
Ejemplo n.º 5
0
def export_book_ids():
    query = '''
        SELECT gr_book_rid, gr_book_id, gr_work_id, cluster
        FROM gr.book_ids JOIN gr.book_cluster USING (gr_book_id)
        ORDER BY gr_book_rid
    '''

    with db.connect() as dbc:
        _log.info('reading book IDs')
        books = db.load_table(dbc, query)

    csv_fn = 'gr-book-ids.csv.gz'
    pq_fn = 'gr-book-ids.parquet'
    _log.info('writing CSV to %s', csv_fn)
    books.to_csv(csv_fn, index=False)
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False)
Ejemplo n.º 6
0
def export_work_authors():
    query = f'''
        WITH
            pairs AS (SELECT DISTINCT gr_work_id AS work_id, gr_author_id
                      FROM gr.book_ids JOIN gr.book_authors USING (gr_book_rid)
                      WHERE author_role = '' AND gr_work_id IS NOT NULL)
        SELECT work_id, gr_author_id AS author_id, author_name
        FROM pairs JOIN gr.author_info USING (gr_author_id)
        ORDER BY work_id
    '''

    with db.connect() as dbc:
        _log.info('reading work authors')
        books = db.load_table(dbc, query)

    pq_fn = 'gr-work-authors.parquet'
    _log.info('writing parquet to %s', pq_fn)
    books.to_parquet(pq_fn, index=False, compression='brotli')
    _log.info('writing CSV')
    books.to_csv('gr-work-authors.csv.gz', index=False)