def get_book_info(book_id, genre):
    """Fetch one book's metadata from Goodreads and flatten it into a dict.

    Args:
        book_id: Goodreads book id (string, as read from the ids CSV).
        genre: genre from the manual dataset; when empty, a genre is
            inferred from the book's popular shelves.

    Returns:
        dict with ``book_id``, the basic book fields, work-level fields when
        the work's best book is this book, and possibly ``genre``.
    """
    book_obj = goodreads.get_object('book', book_id)
    book_info = dict(book_id=book_id)
    dict_fields = ['title', 'image_url', 'publisher', 'num_pages', 'link',
                   'isbn', 'isbn13', 'publication_year',
                   'publication_month', 'publication_day']
    # check if missing information
    hasEmpty = False
    for field in dict_fields:
        # `is None` is the idiomatic (and PEP 8 mandated) None test.
        if book_obj[field] is None:
            hasEmpty = True
        book_info[field] = book_obj[field]

    work = book_obj['work']
    # check if the work is same as the books
    if work['best_book_id']['#text'] == book_id:
        work_dict_fields = ['ratings_sum', 'ratings_count',
                            'original_publication_year',
                            'original_publication_month',
                            'original_publication_day']
        for field in work_dict_fields:
            if '#text' in work[field]:
                book_info[field] = work[field]['#text']

    # Parentheses make the original implicit `and`-before-`or` precedence
    # explicit; `or ''` guards the `in` test against image_url being None
    # (hasEmpty just showed fields can be None, and `x in None` raises).
    if ((hasEmpty and book_info['link'] != '')
            or 'nophoto/book/111' in (book_info['image_url'] or '')):
        # update in 2019 -- not working anymore, manually check missing information
        # parse missing data from html page
        # data = parse_html_page(book_info['link'],
        #                        book_info['publication_year'],
        #                        book_info['publication_month'],
        #                        book_info['publication_day'],
        #                        book_info['publisher'],
        #                        book_info['num_pages'],
        #                        book_info['image_url']
        #                        )
        # book_info.update(data)
        pass  # HTML fallback disabled since 2019; body kept above for reference.

    # no genre info
    if genre == '':
        shelves = list(map(lambda x: x['@name'],
                           book_obj['popular_shelves']['shelf']))
        if ('non-fiction' in shelves) and ('fiction' in shelves):
            # Both shelves present: the one listed earlier (more popular) wins.
            book_info['genre'] = ('Nonfiction'
                                  if shelves.index('non-fiction') < shelves.index('fiction')
                                  else 'Fiction')
        elif ('nonfiction' in shelves) or ('non-fiction' in shelves):
            book_info['genre'] = 'Nonfiction'
        elif ('novels' in shelves) or ('fiction' in shelves):
            book_info['genre'] = 'Fiction'
    return book_info


# load data from the dataset collected manually
start_year = 2019
end_year = 2019

with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f:
    data = []
    for row in csv.DictReader(f):
        print(row['year'], row['book_title'], row['book_id'])
        # start_year is the upper bound, end_year the lower one.
        if end_year <= int(row['year']) <= start_year:
            print('--get book info')
            datum_base = row
            datum = get_book_info(row['book_id'], row['genre'])
            datum_base.update(datum)
            data.append(datum_base)

print(data)
goodreads.save_as_csv('book-info', data)
# print (author_info) return author_info # get author ids author_ids = [] with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f: for row in csv.DictReader(f): if not row['author_id'] in author_ids: author_ids.append(row['author_id']) if row['author2_id'] != '' and not row['author2_id'] in author_ids: author_ids.append(row['author2_id']) # check author_ids parsed ids already_parsed = [] with open('csv/author-info.csv', newline='', encoding='utf-8', errors='ignore') as f: for row in csv.DictReader(f): already_parsed.append(row['author_id']) # get newly added author info data = [] for id in author_ids: if not id in already_parsed: print ('--get author info', id) if has_manual_data: datum = update_author_info(id) else: datum = get_author_info(id) data.append(datum) goodreads.save_as_csv('author-info', data)
#no genre info if genre == '': shelves = list( map(lambda x: x['@name'], book_obj['popular_shelves']['shelf'])) if ('non-fiction' in shelves) and ('fiction' in shelves): book_info['genre'] = 'Nonfiction' if shelves.index( 'non-fiction') < shelves.index('fiction') else 'Fiction' elif ('nonfiction' in shelves) or ('non-fiction' in shelves): book_info['genre'] = 'Nonfiction' elif ('novels' in shelves) or ('fiction' in shelves): book_info['genre'] = 'Fiction' return book_info #load data from the dataset collected manually start_year = 2018 end_year = 2018 with open('csv/goodreads-ids.csv', newline='', encoding='latin-1') as f: data = [] for row in csv.DictReader(f): print(row['year'], row['book_title'], row['book_id']) if int(row['year']) <= start_year and int(row['year']) >= end_year: print('--get book info') datum_base = row datum = get_book_info(row['book_id'], row['genre']) datum_base.update(datum) data.append(datum_base) print(data) goodreads.save_as_csv('book-info', data)
if search_count > 0: if search_count == 1: r = obj['GoodreadsResponse']['search']['results']['work'][ 'best_book'] else: r = find_best_match( obj['GoodreadsResponse']['search']['results']['work'], author, title, r) if r != '__N/A__': book_id = r['id']['#text'] author_id = r['author']['id']['#text'] else: print('----- no matching author or title id', author, title) return dict(book_id=book_id, author_id=author_id) #load data from the dataset collected manually start_year = 2018 end_year = 2018 with open('csv/nytimes-best-books.csv', newline='', encoding='latin-1') as f: data = [] for row in csv.DictReader(f): print(row['year'], row['author_name'], row['book_title']) if int(row['year']) <= start_year and int(row['year']) >= end_year: datum_base = row datum = search_by_author_and_book(row['book_title'], row['author_name']) datum_base.update(datum) data.append(datum_base) goodreads.save_as_csv('goodreads-ids', data)
if search_count > 0: if search_count == 1: r = obj['GoodreadsResponse']['search']['results']['work'][ 'best_book'] else: r = find_best_match( obj['GoodreadsResponse']['search']['results']['work'], author, title, r) if r != '__N/A__': book_id = r['id']['#text'] author_id = r['author']['id']['#text'] else: print('----- no matching author or title id', author, title) return dict(book_id=book_id, author_id=author_id) #load data from the dataset collected manually start_year = 2014 end_year = 2017 with open('data/NYT_fiction.csv', newline='') as f: data = [] for row in csv.DictReader(f): print(row['year'], row['author_name'], row['book_title']) if int(row['year']) >= start_year and int(row['year']) <= end_year: datum_base = row datum = search_by_author_and_book(row['book_title'], row['author_name']) datum_base.update(datum) data.append(datum_base) goodreads.save_as_csv('fiction_goodreads-ids', data)
book_id = "" author_id = "" if search_count > 0: if search_count == 1: r = obj["GoodreadsResponse"]["search"]["results"]["work"]["best_book"] else: r = find_best_match( obj["GoodreadsResponse"]["search"]["results"]["work"], author, title, r ) if r != "__N/A__": book_id = r["id"]["#text"] author_id = r["author"]["id"]["#text"] else: print("----- no matching author or title id", author, title) return dict(book_id=book_id, author_id=author_id) # load data from the dataset collected manually start_year = 2019 end_year = 2019 with open("csv/nytimes-best-books.csv", newline="", encoding="latin-1") as f: data = [] for row in csv.DictReader(f): print(row["year"], row["author_name"], row["book_title"]) if int(row["year"]) <= start_year and int(row["year"]) >= end_year: datum_base = row datum = search_by_author_and_book(row["book_title"], row["author_name"]) datum_base.update(datum) data.append(datum_base) goodreads.save_as_csv("goodreads-ids", data)