Example #1
0
def get_author_info(author_id):
    """Collect one author's Goodreads fields and split the dates into parts.

    The author record is looked up with
    ``goodreads.get_object('author', author_id)``. When the record has no
    ``born_at`` value, ``parse_html_page`` is asked for one using the
    author's link and ``search_wiki`` is called with the author's name.

    Args:
        author_id: Identifier passed through to ``goodreads.get_object``.

    Returns:
        dict with ``author_id``, the copied record fields (``name``,
        ``gender``, ``born_at``, ``died_at``, ``link``) and the keys
        ``birth_year``/``birth_month``/``birth_day`` and
        ``death_year``/``death_month``/``death_day``. The date-part keys are
        strings produced by ``strftime`` or ``None`` when the date is absent.

    Raises:
        ValueError: if a present date is not in ``%Y/%m/%d`` form.
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)
    # 'name' and 'link' are read further down, so they have to be copied
    # here too; without them the function failed with KeyError.
    dict_fields = ['name', 'gender', 'born_at', 'died_at', 'link']
    for field in dict_fields:
        author_info[field] = author_obj[field]

    # often there's multiple spaces between first and last name
    author_info['name'] = re.sub(r'\s+', ' ', author_info['name']).strip()

    # Every date part starts out as None and is only filled in when the
    # corresponding date is available.
    for prefix in ('birth', 'death'):
        for part in ('year', 'month', 'day'):
            author_info[prefix + '_' + part] = None

    if author_obj['born_at'] is None:
        # Update birth info from the web pages
        author_info['born_at'] = parse_html_page(author_info['link'], author_info['born_at'])
        # show possible birth date from Wikipedia (done whenever the
        # Goodreads record itself had no birth date)
        search_wiki(author_info['name'])

    for source, prefix in (('born_at', 'birth'), ('died_at', 'death')):
        raw_date = author_info[source]
        if raw_date is None:
            continue
        # Parse once and reuse the result for all three parts.
        parsed = datetime.strptime(raw_date, '%Y/%m/%d')
        author_info[prefix + '_year'] = parsed.strftime('%Y')
        author_info[prefix + '_month'] = parsed.strftime('%m')
        author_info[prefix + '_day'] = parsed.strftime('%d')

    return author_info
Example #2
0
def update_author_info(author_id):
    """Return the basic Goodreads fields for one author.

    Args:
        author_id: Identifier passed through to ``goodreads.get_object``.

    Returns:
        dict with ``author_id`` plus the ``name``, ``image_url`` and ``link``
        values copied from the author record.
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)
    # Progress output: shows which author is being processed.
    print (author_info)
    dict_fields = ['name', 'image_url', 'link']
    for field in dict_fields:
        author_info[field] = author_obj[field]
    # Hand the collected fields back to the caller; previously the dict was
    # built and then dropped.
    return author_info
Example #3
0
def get_book_info(book_id, genre):
    """Collect one book's Goodreads fields, ratings and a coarse genre.

    The book record is looked up with
    ``goodreads.get_object('book', book_id)``.

    Args:
        book_id: Identifier passed through to ``goodreads.get_object``; it is
            also compared against the work's ``best_book_id`` text.
        genre: Genre already known for the book. Only when this is the empty
            string is a genre inferred from the book's popular shelves.

    Returns:
        dict with ``book_id`` and the copied book fields. Rating and original
        publication fields are added when the work's best book is this book
        and the field carries a ``#text`` value. ``genre`` is added only when
        it was inferred from the shelves.
    """
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = ['title', 'image_url', 'publisher',
                   'num_pages', 'link', 'isbn', 'isbn13',
                   'publication_year', 'publication_month', 'publication_day']
    for field in dict_fields:
        book_info[field] = book_obj[field]

    work = book_obj['work']
    # check if the work is same as the books
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = ['ratings_sum', 'ratings_count',
                            'original_publication_year',
                            'original_publication_month',
                            'original_publication_day']
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']

    # update in 2019 -- filling missing fields (publication date, publisher,
    # num_pages, image_url) via parse_html_page is not working anymore, so
    # missing information has to be checked manually. The guarding `if` had
    # no body left and made the function a syntax error, so it was removed
    # together with the missing-field flag that only it used.

    # no genre info
    if genre == '':
        shelves = [shelf['@name'] for shelf in book_obj['popular_shelves']['shelf']]
        if ('non-fiction' in shelves) and ('fiction' in shelves):
            # Both labels present: whichever shelf is listed first wins.
            if shelves.index('non-fiction') < shelves.index('fiction'):
                book_info['genre'] = 'Nonfiction'
            else:
                book_info['genre'] = 'Fiction'
        elif ('nonfiction' in shelves) or ('non-fiction' in shelves):
            book_info['genre'] = 'Nonfiction'
        elif ('novels' in shelves) or ('fiction' in shelves):
            book_info['genre'] = 'Fiction'

    return book_info

#load data from the dataset collected manually
# Rows whose year lies in the inclusive range [start_year, end_year] are
# enriched with get_book_info() and written out via goodreads.save_as_csv().
start_year = 2019
end_year = 2019
data = []
with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f:
    for row in csv.DictReader(f):
        print (row['year'], row['book_title'], row['book_id'])
        # Inclusive range test; the previous form (year <= start_year and
        # year >= end_year) only matched when both bounds were equal.
        if start_year <= int(row['year']) <= end_year:
            print ('--get book info')
            # The CSV row itself is extended with the fetched fields.
            datum_base = row
            datum = get_book_info(row['book_id'], row['genre'])
            datum_base.update(datum)
            data.append(datum_base)
print (data)
goodreads.save_as_csv('book-info', data)
Example #4
0
def update_author_info(author_id):
    """Merge an author's Goodreads fields with the manually curated ones.

    Args:
        author_id: Identifier passed to ``goodreads.get_object`` and used as
            the key into ``author_manual``.

    Returns:
        dict with ``author_id``; ``name``, ``image_url`` and ``link`` from
        the Goodreads record; and gender plus birth/death date parts taken
        from ``author_manual[author_id]``.
    """
    record = goodreads.get_object('author', author_id)
    author_info = {'author_id': author_id}
    author_info.update(
        (key, record[key]) for key in ('name', 'image_url', 'link'))

    # COMMENT this for the first run, updated with manually added birth and
    # death dates
    manual_entry = author_manual[author_id]
    manual_keys = ('gender',
                   'birth_year', 'birth_month', 'birth_day',
                   'death_year', 'death_month', 'death_day')
    author_info.update((key, manual_entry[key]) for key in manual_keys)
    return author_info
Example #5
0
def update_author_info(author_id):
    """Return the basic Goodreads fields for one author (first-run variant).

    Args:
        author_id: Identifier passed through to ``goodreads.get_object``.

    Returns:
        dict with ``author_id`` plus the ``name``, ``image_url`` and ``link``
        values copied from the author record.
    """
    record = goodreads.get_object('author', author_id)
    author_info = {'author_id': author_id}
    # Progress output: printed before the record fields are merged in.
    print(author_info)
    author_info.update(
        (key, record[key]) for key in ('name', 'image_url', 'link'))
    #COMMENT following three lines, this is for the first run and the manual addition of birth and death dates
    # update_fields = ['gender', 'birth_year', 'birth_month', 'birth_day', 'death_year', 'death_month', 'death_day']
    # for field in update_fields:
    #     author_info[field] = author_manual[author_id][field]
    return author_info
Example #6
0
def get_book_info(book_id, genre):
    """Collect one book's Goodreads fields, ratings and matching shelf names.

    The book record is looked up with
    ``goodreads.get_object('book', book_id)``. When a copied field is missing
    (and a link is present) or the image URL contains the
    ``nophoto/book/111`` placeholder path, ``parse_html_page`` is called and
    its result is merged into the returned dict.

    Args:
        book_id: Identifier passed through to ``goodreads.get_object``; it is
            also compared against the work's ``best_book_id`` text.
        genre: Not used by this version; kept so existing callers keep
            working (the Fiction/Nonfiction inference from shelves is
            disabled here in favour of the ``genres`` list).

    Returns:
        dict with ``book_id``, the copied book fields, optional rating and
        original-publication-year fields, and ``genres``: the shelf names
        found in ``keywords``, in shelf order, capped at 10 entries.
    """
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = [
        'title', 'image_url', 'publisher', 'num_pages', 'link', 'isbn',
        'isbn13', 'publication_year'
    ]
    # check if missing information
    has_empty = False
    for field in dict_fields:
        if book_obj[field] is None:
            has_empty = True
        book_info[field] = book_obj[field]

    work = book_obj['work']
    # check if the work is same as the books
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = [
            'ratings_sum', 'ratings_count', 'original_publication_year'
        ]
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']

    # collect the genres related to this book: shelf names that appear in
    # `keywords`, stopping once 10 have been gathered
    top_genres = []
    for shelf in book_obj['popular_shelves']['shelf']:
        if shelf['@name'] in keywords:
            top_genres.append(shelf['@name'])
        if len(top_genres) == 10:
            break
    book_info['genres'] = top_genres

    # A missing image URL is None; treat it as empty so the placeholder
    # check below cannot fail with TypeError.
    image_url = book_info['image_url'] or ''
    # Parentheses spell out the precedence the condition always had.
    if (has_empty and book_info['link'] != '') or 'nophoto/book/111' in image_url:
        # parse missing data from html page
        print('---parse html')
        data = parse_html_page(book_info['link'],
                               book_info['publication_year'],
                               book_info['publisher'], book_info['num_pages'],
                               book_info['image_url'])
        book_info.update(data)

    return book_info
Example #7
0
def get_book_info(book_id, genre):
    """Collect one book's Goodreads fields, ratings and a coarse genre.

    The book record is looked up with
    ``goodreads.get_object('book', book_id)``.

    Args:
        book_id: Identifier passed through to ``goodreads.get_object``; it is
            also compared against the work's ``best_book_id`` text.
        genre: Genre already known for the book. Only when this is the empty
            string is a genre inferred from the book's popular shelves.

    Returns:
        dict with ``book_id`` and the copied book fields. Rating and original
        publication fields are added when the work's best book is this book
        and the field carries a ``#text`` value. ``genre`` is added only when
        it was inferred from the shelves.
    """
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = [
        'title', 'image_url', 'publisher', 'num_pages', 'link', 'isbn',
        'isbn13', 'publication_year', 'publication_month', 'publication_day'
    ]
    for field in dict_fields:
        book_info[field] = book_obj[field]

    work = book_obj['work']
    # check if the work is same as the books
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = [
            'ratings_sum', 'ratings_count', 'original_publication_year',
            'original_publication_month', 'original_publication_day'
        ]
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']

    # update in 2019 -- filling missing fields (publication date, publisher,
    # num_pages, image_url) via parse_html_page is not working anymore, so
    # missing information has to be checked manually. The missing-field flag
    # that used to gate that call was no longer read and has been dropped.

    # no genre info
    if genre == '':
        shelves = [
            shelf['@name'] for shelf in book_obj['popular_shelves']['shelf']
        ]
        if ('non-fiction' in shelves) and ('fiction' in shelves):
            # Both labels present: whichever shelf is listed first wins.
            if shelves.index('non-fiction') < shelves.index('fiction'):
                book_info['genre'] = 'Nonfiction'
            else:
                book_info['genre'] = 'Fiction'
        elif ('nonfiction' in shelves) or ('non-fiction' in shelves):
            book_info['genre'] = 'Nonfiction'
        elif ('novels' in shelves) or ('fiction' in shelves):
            book_info['genre'] = 'Fiction'

    return book_info
Example #8
0
def get_author_info(author_id):
    """Collect one author's Goodreads fields with empty date-part slots.

    The author record is looked up with
    ``goodreads.get_object('author', author_id)``. The resulting dict is
    printed before it is returned.

    Args:
        author_id: Identifier passed through to ``goodreads.get_object``.

    Returns:
        dict with ``author_id``, the copied record fields (``name`` with
        whitespace runs collapsed, ``gender``, ``born_at``, ``died_at``,
        ``link``, ``image_url``) and the keys ``birth_year``,
        ``birth_month``, ``birth_day``, ``death_year``, ``death_month`` and
        ``death_day``, all set to ``None``.
    """
    author_obj = goodreads.get_object('author', author_id)
    author_info = dict(author_id=author_id)
    dict_fields = ['name', 'gender', 'born_at', 'died_at', 'link', 'image_url']
    for field in dict_fields:
        author_info[field] = author_obj[field]

    # often there's multiple spaces between first and last name
    # (raw string: '\s' in a plain literal is an invalid escape sequence)
    author_info['name'] = re.sub(r'\s+', ' ', author_info['name']).strip()

    # Date parts are left empty here.
    # 2019 update - wikipedia page doesn't work :( so the former steps
    # (parse_html_page / search_wiki lookup of a missing birth date and
    # splitting 'born_at' / 'died_at' with the '%Y/%m/%d' format) are
    # disabled.
    for prefix in ('birth', 'death'):
        for part in ('year', 'month', 'day'):
            author_info[prefix + '_' + part] = None

    print (author_info)
    return author_info