def get_pages_titles(index_pages, books_list, title_set=None):
    """Collect book-page links and their cleaned titles from index pages.

    Args:
        index_pages: iterable of index-page URLs to scrape (processed sorted).
        books_list: base URL used to absolutize the collected relative links.
        title_set: optional collection of titles; when given, only books whose
            cleaned title is in it are kept.

    Returns:
        (book_pages, titles): parallel lists of absolute links and titles.
    """
    book_pages = []
    titles = []
    for page_link in sorted(index_pages):  # get book pages
        soup_page = get_soup(page_link)
        book_links = soup_page.find_all(
            'p')  # 1 p element has 1 or more book listings
        for b in book_links:
            for elem in b.find_all('a'):
                href = elem.get('href')
                basename = href.rsplit('/', 1)[-1]
                # keep only .asp book pages; skip per-chapter "notes" pages
                # and the site's first.asp index page
                if href.endswith(
                        '.asp'
                ) and 'notes' not in basename and 'first.asp' != basename:
                    title = get_clean_text(elem)
                    title = title.replace('Downloadable/Printable Version', '')
                    # NOTE(review): this strips a space-like character from the
                    # title; presumably it was a non-breaking space in the
                    # original markup -- confirm against live pages
                    title = title.replace(' ', '')
                    # drop empty/non-book entries (quote pages, "Read the ..." links)
                    if not title or title == 'Quotes' or title == 'Quotations' or title.startswith(
                            'Read the'):
                        continue
                    if title_set and title not in title_set:
                        continue
                    book_pages.append(href)
                    titles.append(title)
    book_pages = get_absolute_links(book_pages, books_list)
    return book_pages, titles
def is_paywall(link):
    """Return True when the page at *link* displays the paywall notice."""
    page = get_soup(link)
    return any(
        get_clean_text(par).startswith('NOTICE: Unfortunately')
        for par in page.find_all('p'))
def get_title_url_map(books_list, title_set=None):
    """Map book titles to absolute guide URLs scraped from the book-list page.

    Args:
        books_list: URL of the page listing all study guides.
        title_set: optional collection of titles to restrict the result to.

    Returns:
        dict mapping cleaned title -> absolute guide URL.
    """
    page = get_soup(books_list, sleep=SLEEP)
    # previously matched class_='views-table cols-2'
    anchors = page.find('table', class_='cols-2').find_all('a')
    mapping = {}
    for anchor in anchors:
        name = get_clean_text(anchor).replace(' Study Guide', '')
        if title_set and name not in title_set:
            continue
        mapping[name] = urllib.parse.urljoin(books_list, anchor.get('href'))
    return mapping
def find_articles(search_query, domain, num_pages=3):
    """Search for articles and return the filtered result links.

    Args:
        search_query: query string passed to the search-URL builder.
        domain: site domain used both to build the search URL and to filter
            the scraped links.
        num_pages: number of result pages to fetch; defaults to 3, matching
            the previously hard-coded ``range(1, 4)`` behavior.

    Returns:
        list of links accumulated across all fetched pages, page order kept.
    """
    full_links = []
    for page in range(1, num_pages + 1):
        print("Fetching page ", page)
        url = get_url(search_query, domain, page)
        soup = scrape_lib.get_soup(url)
        links = get_links(soup)
        links = filter_links(links, domain)
        # extend instead of repeated list concatenation (avoids O(n^2) copying)
        full_links.extend(links)
    return full_links
def get_plot_summary(url, sleep=SLEEP):
    """Return the plot-summary paragraphs for a study-guide page.

    Retries the fetch once when the study-guide div is missing, then falls
    back to an archive.org snapshot.  When the page contains sub-section
    headings, delegates to get_section_summary() and returns its text part.
    """
    soup = get_soup(url, sleep=sleep)
    # a plot page is expected to be a single page; pagination means we were
    # given the wrong kind of page
    pagination = soup.find(class_='pagination-links') or soup.find(
        class_='interior-sticky-nav__navigation__list--short')
    assert pagination is None
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide:
        # transient load failure -- retry once
        soup = get_soup(url, sleep=sleep)
        studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide:
        # still missing: fall back to the archive.org copy
        archived_url = get_archived(url)
        print(
            'WARNING: study guide not found for {} , trying archived version from {}'
            .format(url, archived_url))
        soup = get_soup(archived_url, sleep=sleep)
        studyguide = soup.find('div', {'class': 'studyGuideText'})
    if studyguide and not studyguide.findAll(H3H4):
        # no sub-section headings: the whole page is the plot summary
        return _get_paragraphs(soup)
    else:
        # page has sub-sections: reuse the section scraper, keep only the text
        return get_section_summary(url)[1]
def get_title_url_map(books_list, title_set=None):
    """Build a title -> absolute URL map from the nested-table book index.

    Args:
        books_list: URL of the book index page (nested-table layout).
        title_set: optional collection of titles to restrict the result to.

    Returns:
        dict mapping title -> absolute book URL.
    """
    page = get_soup(books_list)
    # the second width-less table holds one nested table per column of books
    columns = page.find_all('table', width=None)[1].find_all('table')
    mapping = {}
    for column in columns:
        for row in column.find_all('tr'):
            paragraph = row.find('p')
            for anchor in paragraph.find_all('a'):
                name = get_clean_text(anchor)
                if title_set and name not in title_set:
                    continue
                mapping[name] = urllib.parse.urljoin(books_list,
                                                     anchor.get('href'))
    return mapping
def process_chapter(link):
    """Scrape one bookwolf chapter page into a list of summary paragraphs.

    Returns None (with a printed diagnostic) when zero or more than one
    candidate summary heading is found on the page.
    """
    soup = get_soup(link)
    # the summary section starts at a <p> matching RE_SUMM, or failing that,
    # a bold element matching RE_CONTEXT
    summ_lines = find_all_stripped('p', soup, RE_SUMM) or find_all_stripped(
        'b', soup, RE_CONTEXT)
    # manual fixes
    if link == 'http://www.bookwolf.com/Free_Booknotes/King_Lear_free_booknotes/Act_1_Scene_1_-_King_Lear/act_1_scene_1_-_king_lear.html':
        summ_lines = find_all_stripped('p', soup, RE_LEAR)
    elif link == 'http://www.bookwolf.com/Free_Booknotes/Othello/Act_3_Scene_2_-_Othello_Bookno/act_3_scene_2_-_othello_bookno.html':
        summ_lines = find_all_stripped('p', soup, 'ACT III – Scene.ii')
    if len(summ_lines) > 1:
        print('error, more than 1 summ line: ', link)
        return
    elif not summ_lines:
        print('no summ lines found: ', link)
        return
    # everything after the summary heading is chapter text
    ps = summ_lines[0].find_all_next('p')
    paragraphs = process_paragraphs(ps)
    return paragraphs
def get_index_pages(books_list, source):
    """Return the set of per-letter index-page URLs for a booknotes source.

    Args:
        books_list: URL of the site's master book-list page.
        source: 'monkeynotes' or 'barrons'; selects the <font> markup that
            wraps the alphabetical index links on each site.

    Returns:
        set of absolute index-page URLs.

    Raises:
        ValueError: if *source* is not recognized (previously this fell
            through and crashed later with a NameError on ``tables``).
    """
    soup = get_soup(books_list)
    if source == 'monkeynotes':
        tables = soup.find_all('font', color='#339900', face='Arial, Helvetica')
    elif source == 'barrons':
        tables = soup.find_all('font',
                               color='white',
                               face='Verdana, Arial, Helvetica, sans-serif')
    else:
        raise ValueError('unknown source: {}'.format(source))
    index_pages = []
    for table in tables:  # get index pages for each letter
        for entry in table.find_all('a'):
            index_pages.append(entry.get('href'))
    return set(urllib.parse.urljoin(books_list, x) for x in index_pages)
def process_plot(link):
    """Scrape the plot-summary paragraphs from a novelguide content page.

    Stops at the bold 'Analysis' heading; for paragraphs beginning with a
    bold heading, keeps only the content following the bold tag.
    """
    plot_summ = []
    soup = get_soup(link, sleep=SLEEP)
    content = soup.find('div', id='content-content')
    paras = content.find_all('p')
    for p in paras:
        text = get_clean_text(p, strip=False)
        bold = p.find(['b', 'strong'])
        if bold:
            if bold.get_text() == 'Analysis':
                # the summary section ends where the analysis begins
                break
            sibs = list(bold.next_siblings)
            if sibs:
                # NOTE(review): keeps only the LAST sibling after the bold
                # heading, stringified with markup -- confirm this is intended
                # for the pages this runs against
                text = str(sibs[-1])
            else:
                continue
        # skip the login prompt paragraph
        if p and not text.startswith('Log in'):
            plot_summ.append(text)
    return plot_summ
def get_plot_section_urls(url,
                          base_url=BASE_URL,
                          archived=False,
                          update_old=False,
                          sleep=SLEEP):
    """Find the plot-summary URL and the per-section URLs for a guide page.

    Returns:
        (plot_url, section_urls): plot_url may be '' when neither the live
        page nor an archive.org copy exists; section_urls is deduplicated on
        the original (non-archived) URL and preserves page order.
    """
    soup = get_soup(url, sleep=sleep)
    plot_url = urllib.parse.urljoin(url, 'summary/')
    status_code = requests.get(plot_url).status_code
    if status_code in set([404, 500]):
        # live summary page is gone; try the archive.org copy
        try:
            plot_url = get_archived(plot_url)
        except requests.exceptions.HTTPError as e:
            print('plot url not found at', plot_url)
            plot_url = ''
    section_urls = []
    seen = set()
    # lists = soup.find_all(class_='lists')
    # litems = lists[0].find_all('li')
    lists = soup.find_all(class_='landing-page__umbrella__section__list')
    litems = lists[1].find_all('li')
    if not litems:
        # NOTE(review): findAll is an alias of find_all, so this retry is a
        # no-op -- possibly lists[2] was intended; confirm
        litems = lists[1].findAll('li')
    if len(litems) == 1:
        # a single item means we grabbed the wrong list; use the next one
        litems = lists[2].findAll('li')
    for item in litems:
        if not item.a:
            continue
        href = item.a.get('href')
        # if not href:
        #     pass
        if 'section' in href:
            if archived:
                orig_url = get_orig_url(href)
                url = get_archived(orig_url, update_old)
            else:
                url = urllib.parse.urljoin(base_url, item.a['href'])
                orig_url = url
            # dedupe on the original URL so archived/live variants don't repeat
            if orig_url not in seen:
                section_urls.append(url)
                seen.add(orig_url)
    return plot_url, section_urls
def get_section_summary(url):
    """Split a section page into (sub_section_name, paragraphs) tuples.

    A heading is an h2/h3/h4 tag, a paragraph containing <strong>, or a
    short paragraph (< 20 words) mentioning a known HEADINGS word.
    Sub-sections whose name contains 'analysis' are dropped.
    """
    soup = get_soup(url)
    children = list(soup.find(class_='section__article').children)

    def _is_heading(child):
        # Returns True for headings; False or (implicitly) None otherwise --
        # callers only test truthiness, so both negatives behave the same.
        if child.name not in ['h2', 'h3', 'h4', 'p']:
            return False
        words = child.text.lower().strip().split()
        if child.name in ['h2', 'h3', 'h4']:
            return True
        elif child.strong is not None:
            return True
        elif child.name == 'p' and len(words) < 20:
            # a short paragraph acting as a heading, e.g. 'Summary'
            if any(heading in words for heading in HEADINGS):
                return True
        else:
            return False

    section_summary = []
    ind = 0
    while ind < len(children):
        # skip anything that is neither a heading nor a paragraph
        if not _is_heading(children[ind]) and children[ind].name != 'p':
            ind += 1
            continue
        #New sub-section
        if _is_heading(children[ind]):
            sub_section_name = children[ind].text.strip()
            ind += 1
        else:
            # paragraphs before any heading form an unnamed sub-section
            sub_section_name = None
        subsection = []
        while ind < len(children) and not _is_heading(children[ind]):
            if children[ind].name == 'p':
                subsection.append(children[ind].text.strip())
            ind += 1
        if sub_section_name and 'analysis' in sub_section_name.lower():
            # analysis sub-sections are intentionally excluded
            continue
        section_summary.append((sub_section_name, subsection))
    return section_summary
def process_story_link(link, archived, update_old):
    """Collect chapter summaries from every story linked on *link*.

    Returns None when the page has no story links at all; otherwise a list
    of (title, paragraphs) tuples gathered across all linked stories.
    """
    page = get_soup(link)
    story_anchors = page.find_all('a', text=RE_STORY)
    if not story_anchors:
        return None
    collected = []
    for anchor in story_anchors:  # a story page has multiple chapters
        href = anchor.get('href')
        # For page http://www.pinkmonkey.com/booknotes/barrons/billbud.asp ,
        # we want Typee, but not Billy Budd
        if not href or href.startswith('billbud'):
            continue
        if archived:
            target = get_archived(
                urllib.parse.urljoin(get_orig_url(link), href), update_old)
        else:
            target = urllib.parse.urljoin(link, href)
        story_summs = process_story(target)
        if story_summs:
            collected.extend(story_summs)
    return collected
def get_summaries(books_list,
                  base_url,
                  out_name,
                  pane_name,
                  use_pickled=False,
                  title_set=None,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape all gradesaver book summaries into BookSummary objects.

    Resumes from a previous pickle when *use_pickled* is set, checkpoints
    every *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary namedtuple-like objects.
    """
    # resume from an existing non-empty pickle if requested
    if use_pickled and os.path.exists(out_name) and os.path.getsize(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    # walk the A-Z index to build the title -> url map
    soup = get_soup(books_list)
    title_url_map = {}
    for link in soup.find(class_='alphabits').findAll('li'):
        page_url = urllib.parse.urljoin(base_url, link.a['href'])
        soup = get_soup(page_url)
        for book in soup.find(class_='columnList').findAll('li'):
            title = book.a.text.strip()
            if title_set and title not in title_set:
                continue
            url = urllib.parse.urljoin(base_url, book.a['href'])
            title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        plot_overview = get_plot_summary(soup, pane_name, base_url, archived,
                                         update_old)
        section_summaries = []
        sections = get_sections(soup, pane_name, base_url, archived,
                                update_old)
        for (section_name, url) in sections:
            summary = get_section_summary(url)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=
            None,  # TODO: Need to fix this and get genre from external source
            plot_overview=plot_overview,
            source='gradesaver',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        # periodic checkpoint so a crash doesn't lose everything
        if num_books > 1 and num_books % save_every == 0:
            print("Done scraping {} books".format(num_books))
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
    print('Scraped {} books from gradesaver'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape novelguide summaries for each (title, url) pair.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary objects with source='novelguide'.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for title, url in title_url_map.items():
        # normalize a known title-casing inconsistency
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        # per-book flag: may be cleared below when the archive copy is broken
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        # navigation block holding the plot/section links (two known layouts)
        table = soup.find('div', id='block-booknavigation-3') or soup.find(
            'div', id='block-block-4')
        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:  # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print(' no plot summary found', plot_link)
        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []
        if not cells:
            print(' no section links found for', url)
            continue
        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            # dedupe on the chapter part after the final colon
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print(' seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue
            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])
            try:
                page_summs = process_story(link_summ)
            except AttributeError:  # page failed to load, try again
                print(' retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)
            if page_summs:
                section_summs.extend(page_summs)
            seen_sects.add(section_title_chap)
        if not section_summs:
            print(' could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=plot_summ,
                                source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_summaries(books_list,
                  base_url,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  title_set=None,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape cliffsnotes summaries into BookSummary objects.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary objects with source='cliffsnotes'.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    soup = get_soup(books_list)
    title_url_map = {}
    for book in soup.find(class_='content active').findAll('li'):
        title = book.find('h4').text.strip()
        if title_set and title not in title_set:
            continue
        url = urllib.parse.urljoin(BASE_URL, book.a['href'])
        title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_orig_url(url)
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_overview = get_plot_summary(soup, base_url, archived, update_old)
        # NOTE(review): the scraped plot_overview is immediately discarded
        # here and replaced with '' -- confirm whether this is intentional
        # (e.g. plot handled elsewhere) or a leftover debug line
        plot_overview = ''
        section_summaries = []
        for (section_name, url) in get_sections(soup, base_url, archived,
                                                update_old):
            orig_url = url
            if archived:
                url = get_archived(get_orig_url(url), update_old)
            summary = get_section_summary(url, base_url, archived, update_old)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: Implement retrieving genre from external source
            plot_overview=plot_overview,
            source='cliffsnotes',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from cliffsnotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(url, base_url, archived=False, update_old=False):
    """Scrape one cliffsnotes section page into a list of paragraph strings.

    Collects paragraphs until an 'Analysis' heading; when no analysis was
    seen, follows the 'next page' link (recursively) and appends its
    paragraphs, dropping a trailing 'continued on next page' marker.
    """
    sense37, analysis_count = False, 0  # manual fix for this page with 2 Analysis headings
    if 'https://www.cliffsnotes.com/literature/s/sense-and-sensibility/summary-and-analysis/chapter-37' in url:
        sense37 = True
    analysis_found = False
    soup_all = get_soup(url)
    soup = soup_all.find(class_='copy')
    if not soup:  # this happens if out of date, need to update the archive.org version
        print(f'{url} NO COPY CLASS!')
        return []
    children = list(soup.children)
    section_summary = []
    for i, child in enumerate(children):
        try:
            # container element wrapping several <p> tags
            if len(child.findAll('p')) > 0:
                for c in child.children:
                    try:
                        if c.name == 'p':
                            text = c.text.strip()
                            if text == 'Analysis':
                                analysis_found = True
                                # BreakIt escapes both nested loops at once
                                raise BreakIt
                            if len(text) > 0 and text != 'Summary':
                                section_summary.append(text)
                    except AttributeError:
                        # NavigableString children have no .name
                        continue
            elif child.name == 'p':
                text = child.text.strip()
                if sense37 and text == 'Analysis':
                    # skip the FIRST Analysis heading on the sense-37 page
                    sense37 = False
                    continue
                elif text == 'Analysis':
                    analysis_found = True
                    break
                if len(text) > 0 and text != 'Summary':
                    section_summary.append(text)
            elif child.name == 'h2' or child.name == 'h3':
                text = child.text.strip()
                if text == 'Analysis':
                    analysis_found = True
                    break
        except AttributeError:
            continue
        except BreakIt:
            break
    # no analysis heading seen: the summary may continue on the next page
    if len(section_summary) > 0 and not analysis_found:
        next_soup = soup_all.find(class_='small-6 columns clear-padding-right')
        if not next_soup:
            return section_summary
        href = next_soup.a['href']
        if href.endswith('character-list'):
            return section_summary
        # if 'book-summary-2' in href:  # TODO: delete this
        #     next_url = 'https://' + get_orig_url(href)
        elif archived:
            next_url = get_archived(get_orig_url(href), update_old)
        else:
            next_url = urllib.parse.urljoin(base_url, href)
        is_continued = 'continued on next page' in section_summary[-1].lower()
        if is_continued:
            # drop the continuation marker itself
            del section_summary[-1]
        cond = next_url.startswith(url)
        if is_continued or cond:
            # NOTE(review): this fetch is unused -- the recursive call below
            # re-fetches next_url itself; looks like a redundant request
            soup = get_soup(next_url)
            try:
                summary = get_section_summary(next_url, base_url, archived,
                                              update_old)
                section_summary.extend(summary)
            except IndexError:
                pass
    return section_summary
def get_info(self, url):
    """Fetch *url* and return [headline, published date, content] as strings."""
    page = scrape_lib.get_soup(url)
    return [
        str(self.get_headline(page)),
        str(self.get_published_date(page)),
        str(self.get_content(page)),
    ]
def process_story(link, title=None, get_next=True, find_continued=False):
    """ returns tuples of (title, summary list) format

    Scrapes one pinkmonkey story page.  Section-heading detection and
    paragraph collection have several link-specific edge cases; when
    *get_next* is set, a single-section continuation page is merged into
    the last chapter.
    """
    soup = get_soup(link)
    chapters = []
    if find_continued:
        # recursive call from the page below: only look for continuation headers
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM_CONTINUED)
        if not lines:
            return []
    ### specific edge cases
    elif 'WhiteFang' in link:
        lines = find_all_stripped(
            ['p', 'h4'], soup, RE_CHAP) + find_all_stripped(['p', 'h4'], soup,
                                                            RE_SUMM)
    elif 'Ulysses' in link:
        lines = find_all_stripped('p', soup, RE_SUMM_3)
    elif 'pmKidnapped16' in link:
        # remove a spurious first match before collecting chapter headings
        find_all_stripped(['p', 'h4'], soup, RE_SUMM)[0].extract()
        lines = find_all_stripped(['p', 'h4'], soup, RE_CHAP)
    ###
    else:
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM) or find_all_stripped(['p', 'h4'], soup, RE_SUMM_2) or \
            find_all_stripped(['p', 'h4'], soup, RE_CHAP)
        lines = [
            x for x in lines
            if (x.find('b') and x.find('b').get_text(strip=True)) or
            x.name == 'h4'
        ]  # line should be bold
        if not lines or 'barrons/house' in link:
            lines.extend(find_all_stripped(['p', 'h4'], soup, RE_NUMDOT))
    if not lines:
        print(' cannot find section titles on', link)
        return []
    if 'pmFrankenstein10' in link:
        lines = lines[1:]
    # Frankenstein pages whose first heading is not Summary/LETTER need the
    # following <p> as the real heading
    frank_cond = 'pmFrankenstein' in link and not any(
        get_clean_text(lines[0]).startswith(x) for x in ('Summary', 'LETTER'))
    if 'barrons/heartdk' in link or frank_cond:
        lines = [lines[0].find_next('p')]
    for line in lines:
        # derive the chapter title: the heading itself, or the <p> before a
        # bare "Summary" heading; a caller-supplied title wins for single
        # sections
        if len(lines) > 1 or not title:
            title_ = line if not re.match(
                RE_SUMM, get_clean_text(line)) else line.find_previous('p')
            title_ = get_clean_text(title_)
        else:
            title_ = title
        if 'pmIdiot' in link or 'pmSecretSharer' in link:
            ps = line.find_all_next(['p', 'b'])
        elif 'wutherg' in link or 'Ulysses' in link:
            # these pages keep text in bare NavigableStrings between <p> tags;
            # wrap runs of strings into synthetic <p> tags
            ps = []
            indiv_strs = []
            for sib in line.next_siblings:
                if sib.name == 'p':
                    if indiv_strs:
                        p = element.Tag(name='p')
                        p.string = ' '.join(indiv_strs)
                        ps.append(p)
                        indiv_strs = []
                    ps.append(sib)
                elif isinstance(sib, element.NavigableString) and \
                        not (sib.endswith("Barron's Booknotes\n") or sib.startswith("MonkeyNotes")):
                    indiv_strs.append(sib)
            if indiv_strs:
                p = element.Tag(name='p')
                p.string = ' '.join(indiv_strs)
                ps.append(p)
        else:
            ps = line.find_all_next(['p', 'h4'])
        paragraphs = process_paragraphs(ps)
        chapters.append((title_, paragraphs))
    if 'junglex' in link:  # this should be moved to manual_fix_individual()
        # merge mis-split scenes of chapter 17 into a single chapter
        assert chapters[3][0] == 'CHAPTER 17'
        assert chapters[7][0] == 'CHAPTER 18'
        clean_scene = lambda x: re.sub('SCENE \d', '', x, 1)
        chapter17 = [
            *chapters[3][1], clean_scene(chapters[4][0]), *chapters[4][1],
            clean_scene(chapters[5][0]), *chapters[5][1],
            clean_scene(chapters[6][0])
        ]
        del chapters[6]
        del chapters[5]
        del chapters[4]
        chapters[3] = (chapters[3][0], chapter17)
    if get_next and chapters:  # check next page if is continued
        next_elem = soup.find('a', text=RE_NEXT)
        if not next_elem:
            pass
        else:
            next_link = urllib.parse.urljoin(link, next_elem['href'])
            chapters2 = process_story(next_link,
                                      get_next=get_next,
                                      find_continued=True)
            if not chapters2:
                pass
            elif len(chapters2) == 1:
                # single continuation section: append its text to our last chapter
                title1, paragraphs1 = chapters.pop(-1)
                title2, paragraphs2 = chapters2[0]
                chapters.append((title1, paragraphs1 + paragraphs2))
    return chapters
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape bookwolf summaries for each (title, url) pair.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary objects with source='bookwolf'.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for title, url in title_url_map.items():  # iterate through books
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        author = ''  # TODO: figure this out
        soup = get_soup(url)
        # two known table-of-contents layouts; the chapter row index differs
        contents = soup.find('table', id='Table56')
        if contents:
            idx = 3
        else:
            contents = soup.find('table', width='99%')
            idx = 4
        if not contents:
            print('table of contents not found on ', url)
            continue
        cells = contents.find('tbody').find_all(
            'tr', recursive=False)[idx].find_all('a')
        # keep only links whose text contains a number (chapter links)
        cells = [x for x in cells if num_in(get_clean_text(x))]
        if not cells:
            print('no chapters found for ', url)
            continue
        sects = []
        for c in cells:  # iterate through sections
            text = get_clean_text(c)
            if 'Interpretation' in text:
                continue
            href = c['href']
            link_summ = urllib.parse.urljoin(url, href)
            if archived:
                # relative hrefs must be joined against the original page URL
                if '/' not in href:
                    orig_url = urllib.parse.urljoin(get_orig_url(url), href)
                else:
                    orig_url = get_orig_url(href)
                link_summ = get_archived(orig_url, update_old)
            paras = process_chapter(link_summ)
            if not paras:
                print('no summaries found on ', link_summ)
                continue
            text = standardize_section_titles(text)
            sects.append((text, paras))
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source='bookwolf',
                                section_summaries=sects)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from bookwolf'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_summaries(guides_page,
                  base_url,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  title_set=None,
                  sleep=SLEEP,
                  flatten=True):
    """Scrape sparknotes study guides into BookSummary objects.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.  When
    *flatten* is true, multi-section pages are flattened into individual
    (section_title, paragraphs) entries with analysis sections dropped.

    Returns:
        list of BookSummary objects with source='sparknotes'.
    """

    def add_summaries(url, section_summaries, flatten=True):  # helper function
        # closes over `archived` / `update_old` from the enclosing call
        summary_obj = get_section_summary(url, archived, update_old)
        multisect_title, sect_summs = summary_obj
        logging.info(multisect_title)
        if flatten:
            for sect_summ in sect_summs:
                sect_title, sect_paras = sect_summ
                # a bare 'Summary' heading inherits the page title
                if sect_title == 'Summary':
                    sect_title = multisect_title
                if re.match(RE_ANALYSIS, sect_title):
                    continue
                logging.info(sect_title)
                summary_obj_new = (sect_title, sect_paras)
                section_summaries.append(summary_obj_new)
        else:
            section_summaries.append(summary_obj)

    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    soup = get_soup(guides_page, sleep=sleep)
    title_url_map = {}
    for section in soup.findAll('section'):
        for book in section.findAll('h4'):
            title = book.a.text.strip()
            if title_set and title not in title_set:
                continue
            url = urllib.parse.urljoin(base_url, book.a['href'])
            title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if archived:
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url, sleep=sleep)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_url, section_urls = get_plot_section_urls(url, base_url, archived,
                                                       update_old)
        if plot_url:
            plot_overview = get_plot_summary(plot_url)
        else:
            plot_overview = None
        section_summaries = []
        for url in section_urls:
            add_summaries(url, section_summaries)
        # single-story special case: use the plot overview as the only section
        if book == 'The Yellow Wallpaper':
            section_summaries = [('Book', plot_overview)]
        if not section_summaries:
            continue
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: get genre from external source
            plot_overview=plot_overview,
            source='sparknotes',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from sparknotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(section_url,
                        archived=False,
                        update_old=False,
                        retry=0,
                        sleep=SLEEP):
    """Scrape a sparknotes section (all of its pages) into sub-summaries.

    Returns:
        (section_name, summaries) where summaries is a list of
        (sub_section_name, paragraphs) tuples accumulated across pagination.

    Retries a failed page once; on repeated failure it prints a message and
    terminates the process (os._exit).
    """

    def _get_type(child):
        # 'h3'/'h4'/'p' for elements we care about, None for everything else
        name = child.name if child.name in H3H4 or child.name == 'p' else None
        return name

    def _get_first(soup):
        # collect the paragraphs BEFORE the first sub-section heading;
        # returns (paragraphs, index of the first heading)
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(
                ind,
                page_elements=page_elements,
        ):
            # skip elements that are neither headings nor paragraphs
            while ind < len(page_elements) and _get_type(
                    page_elements[ind]) is None:
                ind += 1
            return ind

        ind = _increment_ind(0)
        elem = page_elements[ind]
        paragraphs = []
        while _get_type(elem) == 'p':
            paragraphs.append(elem.text.strip().replace('\n', ' '))
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
        return paragraphs, ind

    def _scrape_page(soup, ind=0):
        # collect (heading, paragraphs) pairs starting at element index *ind*
        sub_section_summaries = []
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(ind, page_elements=page_elements):
            while ind < len(page_elements) and _get_type(
                    page_elements[ind]) is None:
                ind += 1
            return ind

        # reached first subsection heading
        while ind < len(page_elements):
            ind = _increment_ind(ind)
            elem = page_elements[ind]
            el_type = _get_type(elem)
            assert el_type == 'h3' or el_type == 'h4'
            sub_section_name = elem.text.strip()
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
            paragraphs = []
            while _get_type(elem) == 'p':
                paragraphs.append(elem.text.strip().replace('\n', ' '))
                ind = _increment_ind(ind + 1)
                if ind == len(page_elements):
                    break
                elem = page_elements[ind]
            sub_section_summaries.append((sub_section_name, paragraphs))
        return sub_section_summaries

    # scrape main page
    soup = get_soup(section_url, sleep=sleep)
    title_tag = soup.find(
        class_='interior-header__title__pagetitle') or soup.find('h2')
    ERRORS = set(['Something bad happened. Sorry.', 'read ECONNRESET'])
    is_error_page = not title_tag or title_tag.text in ERRORS
    if retry == 0 and is_error_page:
        # one retry for transient server errors
        return get_section_summary(section_url, archived, update_old, retry=1)
    # elif retry == 1 and is_error_page:
    #     archived_url = get_archived(section_url, update_old)
    #     print('WARNING: could not load page {} , trying archived version from {}'.format(section_url, archived_url))
    #     return get_section_summary(archived_url, archived, update_old, retry=2)
    elif is_error_page:
        # NOTE(review): os._exit kills the whole process without cleanup --
        # consider raising instead
        print('could not process {}'.format(section_url))
        os._exit(-1)
    section_name = title_tag.text.strip()
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide.findAll(H3H4):
        # no sub-section headings: the whole page is one summary
        paragraphs = _get_paragraphs(soup)
        summaries = [(section_name, paragraphs)]
    else:
        # skip any initial notes
        paragraphs, ind = _get_first(soup)
        summaries = _scrape_page(soup, ind)
    # scrape other pages, if any
    pagination = soup.find(class_='pagination-links') or \
        soup.find(class_='interior-sticky-nav__navigation__list--short') or \
        soup.find(class_='interior-sticky-nav__navigation')
    # # TODO: we can use below logic if sparknotes fixes www.sparknotes.com/lit/crime/section10/ ,
    # # which has chapters 1-4 on page 2, then chapter 5 on page 3
    # if summaries:
    #     at_analysis = re.match(RE_ANALYSIS, summaries[-1][0])
    # else:
    #     at_analysis = False
    # if not at_analysis and pagination is not None:
    if pagination is not None:
        pages = pagination.findAll('a')
        for page in pages[1:]:
            page_url = urllib.parse.urljoin(section_url, page['href'])
            if archived:
                orig_url = urllib.parse.urljoin(get_orig_url(section_url),
                                                page['href'])
                page_url = get_archived(orig_url, update_old)
                page_url = page_url.replace(
                    '/https://', '/',
                    1)  # avoid strange bug with archive.org
            soup = get_soup(page_url, sleep=sleep)
            studyguide = soup.find('div', {'class': 'studyGuideText'})
            if not studyguide:
                # transient failure -- retry once
                soup = get_soup(page_url, sleep=sleep)
                studyguide = soup.find('div', {'class': 'studyGuideText'})
            # if not studyguide:
            #     archived_url = get_archived(page_url)
            #     print('WARNING: study guide not found for {} , trying archived version from {}'.format(page_url, archived_url))
            #     soup = get_soup(archived_url, sleep=sleep)
            #     studyguide = soup.find('div', {'class': 'studyGuideText'})
            if studyguide and not studyguide.findAll(H3H4):
                # no sub-sections, so get all paragraphs and add to previous
                paragraphs = _get_paragraphs(soup)
                summaries[-1][1].extend(paragraphs)
            else:
                # get paragraphs before first subsection
                paragraphs, ind = _get_first(soup)
                summaries[-1][1].extend(paragraphs)
                page_summaries = _scrape_page(soup, ind=ind)
                summaries.extend(page_summaries)
    return section_name, summaries
def process_story(link, title=None):
    """Scrape a novelguide story page into (title, paragraphs) chapters.

    Dispatches on the link to several page-layout handlers (colon-delimited,
    non-bold sections, <br>-broken chapters, bold-heading sections) and
    NFKD-normalizes all paragraph text before returning.
    """
    link = link.replace('http://www.novelguide.com',
                        'https://www.novelguide.com', 1)
    chapters = []
    soup = get_soup(link, sleep=SLEEP)
    if 'mansfield-park/' in link or 'jude-the-obscure' in link:
        content = soup.find('div', class_='content clear-block')
        paras = content.find_all(['p', 'strong', 'div'])[2:]
    else:
        content = soup.find('div', id='content-content')
        paras = content.find_all('p')
    if link.endswith('the-adventures-of-tom-sawyer/novel-summary'):
        # move the first paragraph into the second one, where it belongs
        initial = paras[1].children.__next__()
        initial.insert_before(paras[0])
    sect_summ = []
    # note: overwrites any caller-supplied title with the page title
    title = get_title(soup)
    break_found = False
    write = True
    if 'ivan-fyodorovich' in link:
        # this page from The Brothers Karamazov is different from the others
        texts = [p.text for p in paras]
        summs = colon_section(texts, title)
        # trim trailing non-summary lines from section 9
        summs[9] = (summs[9][0], summs[9][1][:-7])
        chapters.extend(summs)
    else:
        for p in paras:
            text = get_clean_text(p, strip=False).strip()
            if not text or text.startswith('Log in'):
                continue
            br = p.find_all('br')
            if any(x in link for x in NONBOLD_WITH_SECTIONS):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title))
            elif any(x in link for x in set([
                    'ulysses', 'siddhartha', 'awakening', 'brothers-karamazov',
                    'tess-of', 'the-ambass', 'jekyll', 'heart-of-darkness',
                    'winesburg'
            ])):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title, always_write=True))
            elif any(x in link for x in set(['monte-cristo'])):
                texts = list(p.stripped_strings)
                chapters.extend(colon_section(texts, title))
            elif (len(br) > 3 or re.match(RE_CHAP_OPEN, p.get_text()) or
                  any(x in link for x in BREAK_TITLES)) and \
                    'fathers-and-sons' not in link and 'hound' not in link:
                # <br>-separated multi-chapter paragraph
                break_found = True
                chapters.extend(process_chapters(p, title))
                title = list(p.stripped_strings)[0]
            else:  # for sections where the text is in multiple <p> tags
                if text == 'advertisement' and not 'the-awakening' in link:
                    break
                elif text == 'advertisement':
                    continue
                bold = p if p.name == 'strong' else p.find(['b', 'strong'])
                if bold:
                    write = True
                    bold_text = bold.get_text(strip=True)
                    is_summ = re.match(RE_PLOT, bold_text)
                    if any(bold_text.startswith(x) for x in ANALYSIS):
                        # analysis heading: flush the current section, stop writing
                        write = False
                        if sect_summ:
                            chapters.append((title, sect_summ))
                        sect_summ = []
                        continue
                    elif not is_summ:
                        # a new section heading: flush and start a new section
                        if sect_summ:
                            chapters.append((title, sect_summ))
                        title = bold_text if not is_summ else title
                        sect_summ = []
                    sibs = list(bold.next_siblings)
                    if write and sibs:
                        # keep only the plain-string siblings after the heading
                        sibs = [x.strip() for x in sibs if isinstance(x, str)]
                        text = ' '.join(sibs).strip()
                        sect_summ.append(text)
                elif text == 'Analysis':
                    write = False
                    continue
                else:
                    if write:
                        sect_summ.append(text)
    if not break_found and sect_summ:
        # flush the trailing section
        chapters.append((title, sect_summ))
    # normalize unicode and drop empty paragraphs
    for i, chapter in enumerate(chapters):
        norm = [unicodedata.normalize("NFKD", p).strip() for p in chapter[1]]
        norm = [x for x in norm if x]
        chapters[i] = (chapters[i][0], norm)
    return chapters
def process_next_link(link, archived, update_old):
    """Scrape all chapter summaries reachable from a monkeynotes index page.

    Collects every chapter link on the page (with several per-book manual
    fixes for broken hrefs and dead pages), scrapes each one with
    ``process_story``, and returns a list of (title, paragraphs) tuples.
    Returns ``None`` when no chapter links are found.

    :param link: index page URL (possibly an archive.org URL).
    :param archived: when True, resolve links through the Wayback Machine.
    :param update_old: passed through to ``get_archived``.
    """
    soup = get_soup(link)
    chapters = find_all_stripped('a', soup, RE_CHAP)
    # Per-book fixes for index pages with missing or broken chapter links.
    if 'pmEthanFrome' in link:
        chapters += soup.find_all('a', text=RE_OPEN)
    elif 'pmDubliners' in link:
        h3s = soup.find_all('h3')
        for h3 in h3s:
            if h3.text.startswith('Short Story'):
                chapters = h3.find_next_sibling('p').find_all('a')
    elif 'wutherg' in link:
        if chapters[-3]['href'] != 'wutherg47.asp':
            chapters[-3]['href'] = 'wutherg47.asp'
    elif 'pmJungle' in link:
        if chapters[3]['href'] != 'pmJungle20.asp':
            chapters[3]['href'] = 'pmJungle20.asp'
        if chapters[9]['href'] != 'pmJungle31.asp':
            chapters[9]['href'] = 'pmJungle31.asp'
    if not chapters:
        return None
    section_summs = []
    url_title_map = {}
    seen_urls = set()
    # Books whose later pages are dead; hoisted out of the loop (invariant).
    # http://www.pinkmonkey.com:80/booknotes/monkeynotes/pmIdiot16.asp and up
    # pages are dead; likewise for the other strings.
    dead_links1 = {'pmVanity'}
    dead_links2 = {
        'pmPrincePauper', 'pmIdiot', 'pmFatherSon', 'pmGreenwood', 'pmOfHuman'
    }
    dead_links3 = {'pmDeerSlayer', 'pmTypee'}
    for c in chapters:
        href = c.get('href')
        title = get_clean_text(c)
        # pmBabbitt link texts are unusable as titles.
        title = title if 'pmBabbitt' not in link else ''
        url = urllib.parse.urljoin(link, href)
        orig_url = url
        if 'dpbolvw' in url:  # affiliate/ad link, not a chapter
            continue
        is_dead1 = any(x in orig_url for x in dead_links1)
        is_dead2 = any(x in orig_url for x in dead_links2)
        is_dead3 = any(x in orig_url for x in dead_links3)
        if is_dead1 or is_dead2 or is_dead3:
            # Skip pages past each book's known-dead threshold.
            # BUG FIX: regex was '\d+' (invalid escape in a non-raw string,
            # a DeprecationWarning/SyntaxWarning on modern Python).
            page_no = int(re.findall(r'\d+', orig_url)[-1])
            if is_dead1 and page_no >= 17:
                continue
            elif is_dead2 and page_no >= 16:
                continue
            elif is_dead3 and page_no >= 13:
                continue
        if orig_url in seen_urls:
            continue
        if archived:
            orig_url = urllib.parse.urljoin(get_orig_url(link), c.get('href'))
            url = get_archived(orig_url, update_old)
        url_title_map[url] = title
        seen_urls.add(orig_url)
    for url, title in url_title_map.items():
        summs = process_story(url, title)
        for summ in summs:
            # print('  ', summ[0])
            if summ[1]:  # not empty text
                section_summs.append(summ)
    # manual fixes: sections missing from the index page
    extra_sections = []
    if 'pmWinesburg' in link:
        extra_sections = [
            "pmWinesburg20.asp", "pmWinesburg21.asp", "pmWinesburg22.asp"
        ]
    elif 'pmDubliners' in link:
        extra_sections = [
            "pmDubliners12.asp", "pmDubliners16.asp"
        ]  # pmDubliners57.asp has no "Summary" heading, so skip
    if extra_sections:
        if archived:
            links_addtl = [
                get_archived(urllib.parse.urljoin(get_orig_url(link), href),
                             update_old) for href in extra_sections
            ]
        else:
            links_addtl = [
                urllib.parse.urljoin(link, x) for x in extra_sections
            ]
        # Each extra page yields sections; keep only the first of each.
        sect_summs_addtl = [process_story(x) for x in links_addtl]
        sect_summs_addtl = [x[0] for x in sect_summs_addtl]
        section_summs.extend(sect_summs_addtl)
    return section_summs
def get_summaries(page_title_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape pinkmonkey book summaries and pickle them to ``out_name``.

    :param page_title_map: mapping of book index page URL -> book title.
    :param out_name: pickle path for the resulting list of ``BookSummary``.
    :param use_pickled: resume from an existing pickle if present.
    :param archived: fetch pages through the Wayback Machine.
    :param update_old: passed through to ``get_archived``.
    :param save_every: checkpoint the pickle every N new books.
    :param sleep: seconds to wait between books.
    :returns: list of ``BookSummary`` namedtuples/objects.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        # (title, source) pairs already scraped; skip them below.
        done = set([(x.title, x.source) for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for page, title in page_title_map.items():
        if 'barrons' in page.lower():
            source = 'barrons'
        elif 'monkeynotes' in page.lower():
            source = 'monkeynotes'
        else:
            # BUG FIX: previously `source` was left unassigned here, so an
            # unrecognized URL either raised NameError (first iteration) or
            # silently reused the previous book's source. Skip explicitly.
            print('unknown source for page, skipping: ', page)
            continue
        if (title, source) in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            page = get_archived(page, update_old)
        print('processing', title, page)
        author = ''  # TODO: figure this out
        soup_book = get_soup(page)
        next_link = soup_book.find('a', text=RE_NEXT)
        story_link = soup_book.find('a', text=RE_STORY)
        # This one barrons page has neither link but is still scrapable.
        is_hard_times = 'pinkmonkey.com/booknotes/barrons/hardtms.asp' in page
        if not (next_link or story_link or is_hard_times):
            print('cannot find any summaries for ', page)
            continue
        if is_paywall(page):
            print(' page is under a paywall, will be more errors: ', page)
            # continue
        if next_link:  # monkeynotes
            href = next_link.get('href')
            url = urllib.parse.urljoin(get_orig_url(page), href)
            url = get_archived(url, update_old)
            sect_summs = process_next_link(url, archived, update_old)
        elif story_link:  # barrons (most)
            url = page
            sect_summs = process_story_link(url, archived, update_old)
        elif is_hard_times:
            url = page
            sect_summs = process_next_link(url, archived, update_old)
        else:
            print('error')
            sys.exit()
        if not sect_summs:
            print(' Cannot process {}'.format(url))
            # NOTE: expected to reach here for barrons Oliver Twist and
            # barrons The Secret Sharer
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source=source,
                                section_summaries=sect_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        # Periodic checkpoint so a crash doesn't lose all progress.
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from pinkmonkey'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries