# Shared standard-library / third-party imports for the scraper functions below.
# Helpers such as get_soup, get_archived, get_orig_url, get_author, get_clean_text,
# find_all_stripped, num_in, is_paywall, process_story, process_chapter, process_plot,
# standardize_section_titles, sort_cells, the BookSummary container, and constants
# (BASE_URL, PANE_NAME, SLEEP, H3H4, RE_*) are assumed to be defined in the
# surrounding scraper modules (cliffsnotes, sparknotes, bookwolf, pinkmonkey, novelguide).
import os
import pickle
import re
import sys
import time
import urllib.parse

import requests


def get_sections(soup, pane_name, base_url, archived=False, update_old=False):
    summaries = None
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary and analysis' in link.text.lower().strip():
            summaries = link
            break
    sections = []
    try:
        for link in summaries.findAll('li'):
            name = link.text.strip()
            url = urllib.parse.urljoin(base_url, link.a['href'])
            if archived:
                orig_url = get_orig_url(link.a['href'])
                url = get_archived(orig_url, update_old)
            sections.append((name, url))
    except AttributeError:
        pass
    if len(sections) == 0:
        try:
            name = summaries.text.strip()
            url = urllib.parse.urljoin(base_url, summaries.a['href'])
            if archived:
                orig_url = get_orig_url(summaries.a['href'])
                url = get_archived(orig_url, update_old)
            sections.append((name, url))
        except (AttributeError, KeyError, TypeError):
            # the summaries entry may be missing or malformed
            pass
    return sections
def get_plot_summary(soup, pane_name, base_url, archived, update_old):
    summaries = []
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary' in link.text.lower():
            summaries.append(link)
    if len(summaries) > 1:
        # Assume the first one is the overall book/play summary; holds for most cases.
        if archived:
            orig_url = get_orig_url(summaries[0].a['href'])
            link = get_archived(orig_url, update_old)
        else:
            link = urllib.parse.urljoin(base_url, summaries[0].a['href'])
        plot_summary = get_section_summary(link)
        return plot_summary
    else:
        return None
def get_sections(soup, base_url, archived=False, update_old=False, pane_name=PANE_NAME):
    sections = []
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary-and-analysis' in link.a['href']:
            sections.append(link)
    section_urls = []
    for section in sections[1:]:
        name = section.span.text
        url = urllib.parse.urljoin(base_url, section.a['href'])
        if archived:
            url = get_archived(get_orig_url(url), update_old)
        section_urls.append((name, url))
    return section_urls
def get_plot_section_urls(url, base_url=BASE_URL, archived=False, update_old=False, sleep=SLEEP):
    soup = get_soup(url, sleep=sleep)
    plot_url = urllib.parse.urljoin(url, 'summary/')
    status_code = requests.get(plot_url).status_code
    if status_code in set([404, 500]):
        try:
            plot_url = get_archived(plot_url)
        except requests.exceptions.HTTPError:
            print('plot url not found at', plot_url)
            plot_url = ''
    section_urls = []
    seen = set()
    lists = soup.find_all(class_='landing-page__umbrella__section__list')
    litems = lists[1].find_all('li')
    if len(litems) == 1:
        litems = lists[2].findAll('li')
    for item in litems:
        if not item.a:
            continue
        href = item.a.get('href')
        if 'section' in href:
            if archived:
                orig_url = get_orig_url(href)
                url = get_archived(orig_url, update_old)
            else:
                url = urllib.parse.urljoin(base_url, item.a['href'])
                orig_url = url
            if orig_url not in seen:
                section_urls.append(url)
                seen.add(orig_url)
    return plot_url, section_urls
def process_story_link(link, archived, update_old):
    soup = get_soup(link)
    stories = soup.find_all('a', text=RE_STORY)
    if not stories:
        return None
    section_summs = []
    for story in stories:  # a story page has multiple chapters
        href = story.get('href')
        # For page http://www.pinkmonkey.com/booknotes/barrons/billbud.asp , we want Typee, but not Billy Budd
        if not href or href.startswith('billbud'):
            continue
        if archived:
            url = urllib.parse.urljoin(get_orig_url(link), href)
            url = get_archived(url, update_old)
        else:
            url = urllib.parse.urljoin(link, href)
        summs = process_story(url)
        if summs:
            section_summs.extend(summs)
    return section_summs
def get_plot_summary(soup, base_url, archived=False, update_old=False, pane_name=PANE_NAME):
    summaries = []
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary' in link.text.lower():
            summaries.append(link)
    if len(summaries) > 1:
        # Assume the first one is the overall book/play summary; holds for most cases.
        href = summaries[0].a['href']
        link = urllib.parse.urljoin(base_url, href)
        if archived:
            link = get_archived(get_orig_url(href), update_old)
        plot_summary = get_section_summary(link, base_url, archived, update_old)
        return plot_summary
    else:
        return None
def get_summaries(books_list, base_url, out_name, use_pickled=False, archived=False,
                  title_set=None, update_old=False, save_every=5, sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    soup = get_soup(books_list)
    title_url_map = {}
    for book in soup.find(class_='content active').findAll('li'):
        title = book.find('h4').text.strip()
        if title_set and title not in title_set:
            continue
        url = urllib.parse.urljoin(BASE_URL, book.a['href'])
        title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_orig_url(url)
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_overview = get_plot_summary(soup, base_url, archived, update_old)
        plot_overview = ''
        section_summaries = []
        for (section_name, url) in get_sections(soup, base_url, archived, update_old):
            orig_url = url
            if archived:
                url = get_archived(get_orig_url(url), update_old)
            summary = get_section_summary(url, base_url, archived, update_old)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: Implement retrieving genre from external source
            plot_overview=plot_overview,
            source='cliffsnotes',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from cliffsnotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(url, base_url, archived=False, update_old=False):
    # manual fix for this page with 2 Analysis headings
    sense37 = False
    if 'https://www.cliffsnotes.com/literature/s/sense-and-sensibility/summary-and-analysis/chapter-37' in url:
        sense37 = True
    analysis_found = False
    soup_all = get_soup(url)
    soup = soup_all.find(class_='copy')
    if not soup:  # this happens if out of date; need to update the archive.org version
        print(f'{url} NO COPY CLASS!')
        return []
    children = list(soup.children)
    section_summary = []
    for i, child in enumerate(children):
        try:
            if len(child.findAll('p')) > 0:
                for c in child.children:
                    try:
                        if c.name == 'p':
                            text = c.text.strip()
                            if text == 'Analysis':
                                analysis_found = True
                                raise BreakIt
                            if len(text) > 0 and text != 'Summary':
                                section_summary.append(text)
                    except AttributeError:
                        continue
            elif child.name == 'p':
                text = child.text.strip()
                if sense37 and text == 'Analysis':
                    sense37 = False
                    continue
                elif text == 'Analysis':
                    analysis_found = True
                    break
                if len(text) > 0 and text != 'Summary':
                    section_summary.append(text)
            elif child.name == 'h2' or child.name == 'h3':
                text = child.text.strip()
                if text == 'Analysis':
                    analysis_found = True
                    break
        except AttributeError:
            continue
        except BreakIt:
            break
    if len(section_summary) > 0 and not analysis_found:
        next_soup = soup_all.find(class_='small-6 columns clear-padding-right')
        if not next_soup:
            return section_summary
        href = next_soup.a['href']
        if href.endswith('character-list'):
            return section_summary
        elif archived:
            next_url = get_archived(get_orig_url(href), update_old)
        else:
            next_url = urllib.parse.urljoin(base_url, href)
        is_continued = 'continued on next page' in section_summary[-1].lower()
        if is_continued:
            del section_summary[-1]
        cond = next_url.startswith(url)
        if is_continued or cond:
            soup = get_soup(next_url)
            try:
                summary = get_section_summary(next_url, base_url, archived, update_old)
                section_summary.extend(summary)
            except IndexError:
                pass
    return section_summary
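# Note: BreakIt is raised and caught above to break out of the nested paragraph loop.
# It is assumed to be a simple sentinel exception defined elsewhere in this module;
# a minimal definition consistent with that usage would be:
class BreakIt(Exception):
    """Sentinel exception used to break out of nested loops."""
    pass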
def get_summaries(title_url_map, out_name, use_pickled=False, archived=False,
                  update_old=False, save_every=5, sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for title, url in title_url_map.items():  # iterate through books
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        author = ''  # TODO: figure this out
        soup = get_soup(url)
        contents = soup.find('table', id='Table56')
        if contents:
            idx = 3
        else:
            contents = soup.find('table', width='99%')
            idx = 4
        if not contents:
            print('table of contents not found on ', url)
            continue
        cells = contents.find('tbody').find_all('tr', recursive=False)[idx].find_all('a')
        cells = [x for x in cells if num_in(get_clean_text(x))]
        if not cells:
            print('no chapters found for ', url)
            continue
        sects = []
        for c in cells:  # iterate through sections
            text = get_clean_text(c)
            if 'Interpretation' in text:
                continue
            href = c['href']
            link_summ = urllib.parse.urljoin(url, href)
            if archived:
                if '/' not in href:
                    orig_url = urllib.parse.urljoin(get_orig_url(url), href)
                else:
                    orig_url = get_orig_url(href)
                link_summ = get_archived(orig_url, update_old)
            paras = process_chapter(link_summ)
            if not paras:
                print('no summaries found on ', link_summ)
                continue
            text = standardize_section_titles(text)
            sects.append((text, paras))
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source='bookwolf',
                                section_summaries=sects)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from bookwolf'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(section_url, archived=False, update_old=False, retry=0, sleep=SLEEP):

    def _get_type(child):
        name = child.name if child.name in H3H4 or child.name == 'p' else None
        return name

    def _get_first(soup):
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(ind, page_elements=page_elements):
            while ind < len(page_elements) and _get_type(page_elements[ind]) is None:
                ind += 1
            return ind

        ind = _increment_ind(0)
        elem = page_elements[ind]
        paragraphs = []
        while _get_type(elem) == 'p':
            paragraphs.append(elem.text.strip().replace('\n', ' '))
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
        return paragraphs, ind

    def _scrape_page(soup, ind=0):
        sub_section_summaries = []
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(ind, page_elements=page_elements):
            while ind < len(page_elements) and _get_type(page_elements[ind]) is None:
                ind += 1
            return ind

        # reached first subsection heading
        while ind < len(page_elements):
            ind = _increment_ind(ind)
            elem = page_elements[ind]
            el_type = _get_type(elem)
            assert el_type == 'h3' or el_type == 'h4'
            sub_section_name = elem.text.strip()
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
            paragraphs = []
            while _get_type(elem) == 'p':
                paragraphs.append(elem.text.strip().replace('\n', ' '))
                ind = _increment_ind(ind + 1)
                if ind == len(page_elements):
                    break
                elem = page_elements[ind]
            sub_section_summaries.append((sub_section_name, paragraphs))
        return sub_section_summaries

    # scrape main page
    soup = get_soup(section_url, sleep=sleep)
    title_tag = soup.find(class_='interior-header__title__pagetitle') or soup.find('h2')
    ERRORS = set(['Something bad happened. Sorry.', 'read ECONNRESET'])
    is_error_page = not title_tag or title_tag.text in ERRORS
    if retry == 0 and is_error_page:
        return get_section_summary(section_url, archived, update_old, retry=1)
    # elif retry == 1 and is_error_page:
    #     archived_url = get_archived(section_url, update_old)
    #     print('WARNING: could not load page {} , trying archived version from {}'.format(section_url, archived_url))
    #     return get_section_summary(archived_url, archived, update_old, retry=2)
    elif is_error_page:
        print('could not process {}'.format(section_url))
        os._exit(-1)
    section_name = title_tag.text.strip()
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide.findAll(H3H4):
        paragraphs = _get_paragraphs(soup)
        summaries = [(section_name, paragraphs)]
    else:  # skip any initial notes
        paragraphs, ind = _get_first(soup)
        summaries = _scrape_page(soup, ind)

    # scrape other pages, if any
    pagination = soup.find(class_='pagination-links') or \
        soup.find(class_='interior-sticky-nav__navigation__list--short') or \
        soup.find(class_='interior-sticky-nav__navigation')
    # TODO: we can use the logic below if sparknotes fixes www.sparknotes.com/lit/crime/section10/ ,
    # which has chapters 1-4 on page 2, then chapter 5 on page 3
    # if summaries:
    #     at_analysis = re.match(RE_ANALYSIS, summaries[-1][0])
    # else:
    #     at_analysis = False
    # if not at_analysis and pagination is not None:
    if pagination is not None:
        pages = pagination.findAll('a')
        for page in pages[1:]:
            page_url = urllib.parse.urljoin(section_url, page['href'])
            if archived:
                orig_url = urllib.parse.urljoin(get_orig_url(section_url), page['href'])
                page_url = get_archived(orig_url, update_old)
                page_url = page_url.replace('/https://', '/', 1)  # avoid strange bug with archive.org
            soup = get_soup(page_url, sleep=sleep)
            studyguide = soup.find('div', {'class': 'studyGuideText'})
            if not studyguide:
                soup = get_soup(page_url, sleep=sleep)
                studyguide = soup.find('div', {'class': 'studyGuideText'})
            # if not studyguide:
            #     archived_url = get_archived(page_url)
            #     print('WARNING: study guide not found for {} , trying archived version from {}'.format(page_url, archived_url))
            #     soup = get_soup(archived_url, sleep=sleep)
            #     studyguide = soup.find('div', {'class': 'studyGuideText'})
            if studyguide and not studyguide.findAll(H3H4):
                # no sub-sections, so get all paragraphs and add to previous
                paragraphs = _get_paragraphs(soup)
                summaries[-1][1].extend(paragraphs)
            else:
                # get paragraphs before first subsection
                paragraphs, ind = _get_first(soup)
                summaries[-1][1].extend(paragraphs)
                page_summaries = _scrape_page(soup, ind=ind)
                summaries.extend(page_summaries)
    return section_name, summaries
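# Note: _get_paragraphs is called above but defined elsewhere in the sparknotes
# scraper. A hypothetical sketch, assuming it simply collects the text of every
# <p> tag inside the studyGuideText div (mirroring _get_first above), would be:
def _get_paragraphs(soup):
    summary = soup.find('div', {'class': 'studyGuideText'})
    paragraphs = [p.text.strip().replace('\n', ' ') for p in summary.findAll('p')]
    return [p for p in paragraphs if p]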
def get_summaries(page_title_map, out_name, use_pickled=False, archived=False,
                  update_old=False, save_every=5, sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(len(book_summaries)))
        done = set([(x.title, x.source) for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for page, title in page_title_map.items():
        if 'barrons' in page.lower():
            source = 'barrons'
        elif 'monkeynotes' in page.lower():
            source = 'monkeynotes'
        if (title, source) in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            page = get_archived(page, update_old)
        print('processing', title, page)
        author = ''  # TODO: figure this out
        soup_book = get_soup(page)
        next_link = soup_book.find('a', text=RE_NEXT)
        story_link = soup_book.find('a', text=RE_STORY)
        is_hard_times = 'pinkmonkey.com/booknotes/barrons/hardtms.asp' in page
        if not (next_link or story_link or is_hard_times):
            print('cannot find any summaries for ', page)
            continue
        if is_paywall(page):
            print(' page is under a paywall, will be more errors: ', page)
            # continue
        if next_link:  # monkeynotes
            href = next_link.get('href')
            url = urllib.parse.urljoin(get_orig_url(page), href)
            url = get_archived(url, update_old)
            sect_summs = process_next_link(url, archived, update_old)
        elif story_link:  # barrons (most)
            url = page
            sect_summs = process_story_link(url, archived, update_old)
        elif is_hard_times:
            url = page
            sect_summs = process_next_link(url, archived, update_old)
        else:
            print('error')
            sys.exit()
        if not sect_summs:
            print(' Cannot process {}'.format(url))
            # NOTE: expected to reach here for barrons Oliver Twist and barrons The Secret Sharer
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source=source,
                                section_summaries=sect_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from pinkmonkey'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def process_next_link(link, archived, update_old):
    soup = get_soup(link)
    chapters = find_all_stripped('a', soup, RE_CHAP)
    if 'pmEthanFrome' in link:
        chapters += soup.find_all('a', text=RE_OPEN)
    elif 'pmDubliners' in link:
        h3s = soup.find_all('h3')
        for h3 in h3s:
            if h3.text.startswith('Short Story'):
                chapters = h3.find_next_sibling('p').find_all('a')
    elif 'wutherg' in link:
        if chapters[-3]['href'] != 'wutherg47.asp':
            chapters[-3]['href'] = 'wutherg47.asp'
    elif 'pmJungle' in link:
        if chapters[3]['href'] != 'pmJungle20.asp':
            chapters[3]['href'] = 'pmJungle20.asp'
        if chapters[9]['href'] != 'pmJungle31.asp':
            chapters[9]['href'] = 'pmJungle31.asp'
    if not chapters:
        return None
    section_summs = []
    url_title_map = {}
    seen_urls = set()
    for c in chapters:
        href = c.get('href')
        title = get_clean_text(c)
        title = title if 'pmBabbitt' not in link else ''
        url = urllib.parse.urljoin(link, href)
        orig_url = url
        if 'dpbolvw' in url:
            continue
        dead_links1 = set(['pmVanity'])
        dead_links2 = set(['pmPrincePauper', 'pmIdiot', 'pmFatherSon', 'pmGreenwood', 'pmOfHuman'])
        dead_links3 = set(['pmDeerSlayer', 'pmTypee'])
        is_dead1 = any(x in orig_url for x in dead_links1)
        is_dead2 = any(x in orig_url for x in dead_links2)
        is_dead3 = any(x in orig_url for x in dead_links3)
        if is_dead1 or is_dead2 or is_dead3:
            # http://www.pinkmonkey.com:80/booknotes/monkeynotes/pmIdiot16.asp and up pages are dead
            # likewise for the other strings
            page_no = int(re.findall(r'\d+', orig_url)[-1])
            if is_dead1 and page_no >= 17:
                continue
            elif is_dead2 and page_no >= 16:
                continue
            elif is_dead3 and page_no >= 13:
                continue
        if orig_url in seen_urls:
            continue
        if archived:
            orig_url = urllib.parse.urljoin(get_orig_url(link), c.get('href'))
            url = get_archived(orig_url, update_old)
        url_title_map[url] = title
        seen_urls.add(orig_url)
    for url, title in url_title_map.items():
        summs = process_story(url, title)
        for summ in summs:
            if summ[1]:  # not empty text
                section_summs.append(summ)
    # manual fixes
    extra_sections = []
    if 'pmWinesburg' in link:
        extra_sections = ["pmWinesburg20.asp", "pmWinesburg21.asp", "pmWinesburg22.asp"]
    elif 'pmDubliners' in link:
        # pmDubliners57.asp has no "Summary" heading, so skip it
        extra_sections = ["pmDubliners12.asp", "pmDubliners16.asp"]
    if extra_sections:
        if archived:
            links_addtl = [
                get_archived(urllib.parse.urljoin(get_orig_url(link), href), update_old)
                for href in extra_sections
            ]
        else:
            links_addtl = [urllib.parse.urljoin(link, x) for x in extra_sections]
        sect_summs_addtl = [process_story(x) for x in links_addtl]
        sect_summs_addtl = [x[0] for x in sect_summs_addtl]
        section_summs.extend(sect_summs_addtl)
    return section_summs
def get_summaries(title_url_map, out_name, use_pickled=False, archived=False,
                  update_old=False, save_every=5, sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for title, url in title_url_map.items():
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        table = soup.find('div', id='block-booknavigation-3') or \
            soup.find('div', id='block-block-4')

        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:
                    # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print(' no plot summary found', plot_link)

        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []
        if not cells:
            print(' no section links found for', url)
            continue
        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print(' seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue
            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])
            try:
                page_summs = process_story(link_summ)
            except AttributeError:  # page failed to load, try again
                print(' retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)
            if page_summs:
                section_summs.extend(page_summs)
                seen_sects.add(section_title_chap)
        if not section_summs:
            print(' could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=plot_summ,
                                source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries