Example #1
def get_sections(soup, pane_name, base_url, archived=False, update_old=False):
    summaries = None
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary and analysis' in link.text.lower().strip():
            summaries = link
            break

    sections = []
    try:
        for link in summaries.findAll('li'):
            name = link.text.strip()
            url = urllib.parse.urljoin(base_url, link.a['href'])
            if archived:
                orig_url = get_orig_url(link.a['href'])
                url = get_archived(orig_url, update_old)

            sections.append((name, url))
    except AttributeError:
        pass

    if len(sections) == 0:
        try:
            name = summaries.text.strip()
            url = urllib.parse.urljoin(base_url, summaries.a['href'])
            if archived:
                orig_url = get_orig_url(summaries.a['href'])
                url = get_archived(orig_url, update_old)

            sections.append((name, url))
        except (AttributeError, TypeError):  # no matching "summary" link or missing <a> tag
            pass

    return sections
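The helpers used throughout these examples (get_soup, get_orig_url, get_archived) are not part of the listing. The sketch below shows one minimal way they could be implemented, assuming requests, BeautifulSoup, and the Wayback Machine availability API; the project's real implementations may well differ (for instance in how update_old refreshes stale snapshots).

import re
import time

import requests
from bs4 import BeautifulSoup


def get_soup(url, sleep=0):
    # Fetch a page and parse it; sleep throttles repeated requests.
    if sleep:
        time.sleep(sleep)
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def get_orig_url(url):
    # Strip a web.archive.org prefix such as
    # https://web.archive.org/web/20200101000000/http://example.com/page
    # back down to the original URL; other URLs are returned unchanged.
    match = re.search(r'web\.archive\.org/web/\d+[a-z_]*/(.+)', url)
    return match.group(1) if match else url


def get_archived(url, update_old=False):
    # Ask the Wayback Machine availability API for the closest snapshot.
    # update_old is ignored in this sketch (the real helper presumably
    # re-captures or refreshes stale snapshots).
    resp = requests.get('http://archive.org/wayback/available',
                        params={'url': url})
    resp.raise_for_status()
    closest = resp.json().get('archived_snapshots', {}).get('closest')
    if not closest:
        raise requests.exceptions.HTTPError('no archived snapshot for ' + url)
    return closest['url']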
Example #2
def get_plot_section_urls(url,
                          base_url=BASE_URL,
                          archived=False,
                          update_old=False,
                          sleep=SLEEP):
    soup = get_soup(url, sleep=sleep)
    plot_url = urllib.parse.urljoin(url, 'summary/')
    status_code = requests.get(plot_url).status_code
    if status_code in (404, 500):
        try:
            plot_url = get_archived(plot_url)
        except requests.exceptions.HTTPError as e:
            print('plot url not found at', plot_url)
            plot_url = ''

    section_urls = []
    seen = set()
    # lists = soup.find_all(class_='lists')
    # litems = lists[0].find_all('li')
    lists = soup.find_all(class_='landing-page__umbrella__section__list')
    litems = lists[1].find_all('li')
    if len(litems) == 1:
        litems = lists[2].find_all('li')
    for item in litems:
        if not item.a:
            continue
        href = item.a.get('href')
        if not href:
            continue
        if 'section' in href:
            if archived:
                orig_url = get_orig_url(href)
                url = get_archived(orig_url, update_old)
            else:
                url = urllib.parse.urljoin(base_url, item.a['href'])
                orig_url = url
            if orig_url not in seen:
                section_urls.append(url)
                seen.add(orig_url)

    return plot_url, section_urls
Example #3
def get_plot_summary(soup, pane_name, base_url, archived, update_old):
    summaries = []
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary' in link.text.lower():
            summaries.append(link)

    if len(summaries) > 1:
        # Assume the first entry is the overall book/play summary; holds for most cases
        if archived:
            orig_url = get_orig_url(summaries[0].a['href'])
            link = get_archived(orig_url, update_old)
        else:
            link = urllib.parse.urljoin(base_url, summaries[0].a['href'])
        plot_summary = get_section_summary(link)
        return plot_summary
    else:
        return None
def get_sections(soup,
                 base_url,
                 archived=False,
                 update_old=False,
                 pane_name=PANE_NAME):
    sections = []
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary-and-analysis' in link.a['href']:
            sections.append(link)

    section_urls = []
    for section in sections[1:]:
        name = section.span.text
        url = urllib.parse.urljoin(base_url, section.a['href'])
        if archived:
            url = get_archived(get_orig_url(url), update_old)
        section_urls.append((name, url))

    return section_urls
Example #5
def process_story_link(link, archived, update_old):
    soup = get_soup(link)
    stories = soup.find_all('a', text=RE_STORY)
    if not stories:
        return None
    section_summs = []
    for story in stories:  # a story page has multiple chapters
        href = story.get('href')
        # For http://www.pinkmonkey.com/booknotes/barrons/billbud.asp we want Typee, but not Billy Budd
        if not href or href.startswith('billbud'):
            continue
        if archived:
            url = urllib.parse.urljoin(get_orig_url(link), href)
            url = get_archived(url, update_old)
        else:
            url = urllib.parse.urljoin(link, href)
        summs = process_story(url)
        if summs:
            section_summs.extend(summs)
    return section_summs
def get_plot_summary(soup,
                     base_url,
                     archived=False,
                     update_old=False,
                     pane_name=PANE_NAME):
    summaries = []
    for link in soup.find(class_=pane_name).findAll('li'):
        if 'summary' in link.text.lower():
            summaries.append(link)

    if len(summaries) > 1:
        # Assume the first entry is the overall book/play summary; holds for most cases
        href = summaries[0].a['href']
        link = urllib.parse.urljoin(base_url, href)
        if archived:
            link = get_archived(get_orig_url(href), update_old)
        plot_summary = get_section_summary(link, base_url, archived,
                                           update_old)
        return plot_summary
    else:
        return None
Example #7
def get_plot_summary(url, sleep=SLEEP):
    soup = get_soup(url, sleep=sleep)

    pagination = soup.find(class_='pagination-links') or soup.find(
        class_='interior-sticky-nav__navigation__list--short')
    assert pagination is None

    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide:
        soup = get_soup(url, sleep=sleep)
        studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide:
        archived_url = get_archived(url)
        print(
            'WARNING: study guide not found for {} , trying archived version from {}'
            .format(url, archived_url))
        soup = get_soup(archived_url, sleep=sleep)
        studyguide = soup.find('div', {'class': 'studyGuideText'})
    if studyguide and not studyguide.findAll(H3H4):
        return _get_paragraphs(soup)
    else:
        return get_section_summary(url)[1]


if __name__ == "__main__":
    args = parser.parse_args()
    catalog = load_catalog(CATALOG_NAME)
    if args.full:
        title_set = None
    else:
        print('limiting to books from', CATALOG_NAME)
        title_set = set(catalog.keys())

    if args.archived_list:
        books_list = get_archived(BOOKS_LIST)
    else:
        books_list = BOOKS_LIST
    title_url_map = get_title_url_map(books_list, title_set=title_set)
    print('{} book pages total'.format(len(title_url_map)))
    book_summaries = get_summaries(title_url_map, args.out_name,
                                   args.use_pickled, args.archived,
                                   args.update_old, args.save_every,
                                   args.sleep)
    # with open(args.out_name, 'rb') as f:
    #     book_summaries = pickle.load(f)

    book_summaries_overlap = gen_gutenberg_overlap(book_summaries,
                                                   catalog,
                                                   filter_plays=True)
    book_summaries_overlap = manual_fix(book_summaries_overlap)
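Example #7 (and Example #16 below) also relies on a module constant H3H4 and a helper _get_paragraphs that are not shown. A plausible minimal version, assuming the sub-section headings are plain h3/h4 tags inside the studyGuideText div:

H3H4 = ['h3', 'h4']


def _get_paragraphs(soup):
    # Presumed helper: the text of every <p> inside the studyGuideText div,
    # with blank paragraphs dropped and newlines collapsed.
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    return [p.text.strip().replace('\n', ' ')
            for p in studyguide.find_all('p')
            if p.text.strip()]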
Example #9
def get_summaries(books_list,
                  base_url,
                  out_name,
                  pane_name,
                  use_pickled=False,
                  title_set=None,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    if use_pickled and os.path.exists(out_name) and os.path.getsize(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    soup = get_soup(books_list)
    title_url_map = {}
    for link in soup.find(class_='alphabits').findAll('li'):
        page_url = urllib.parse.urljoin(base_url, link.a['href'])
        soup = get_soup(page_url)
        for book in soup.find(class_='columnList').findAll('li'):
            title = book.a.text.strip()
            if title_set and title not in title_set:
                continue
            url = urllib.parse.urljoin(base_url, book.a['href'])
            title_url_map[title] = url

    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        plot_overview = get_plot_summary(soup, pane_name, base_url, archived,
                                         update_old)

        section_summaries = []
        sections = get_sections(soup, pane_name, base_url, archived,
                                update_old)
        for (section_name, url) in sections:
            summary = get_section_summary(url)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: get genre from an external source
            plot_overview=plot_overview,
            source='gradesaver',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)

        if num_books > 1 and num_books % save_every == 0:
            print("Done scraping {} books".format(num_books))
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)

    print('Scraped {} books from gradesaver'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
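Every scraper in these examples builds BookSummary records with the same six fields. The class itself is not in the listing; a minimal stand-in, assuming it is a simple namedtuple (the real project may use a richer class):

from collections import namedtuple

# Presumed container: one record per scraped book.
BookSummary = namedtuple('BookSummary', [
    'title', 'author', 'genre', 'plot_overview', 'source', 'section_summaries'
])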
Example #10
def process_next_link(link, archived, update_old):
    soup = get_soup(link)

    chapters = find_all_stripped('a', soup, RE_CHAP)
    if 'pmEthanFrome' in link:
        chapters += soup.find_all('a', text=RE_OPEN)
    elif 'pmDubliners' in link:
        h3s = soup.find_all('h3')
        for h3 in h3s:
            if h3.text.startswith('Short Story'):
                chapters = h3.find_next_sibling('p').find_all('a')
    elif 'wutherg' in link:
        if chapters[-3]['href'] != 'wutherg47.asp':
            chapters[-3]['href'] = 'wutherg47.asp'
    elif 'pmJungle' in link:
        if chapters[3]['href'] != 'pmJungle20.asp':
            chapters[3]['href'] = 'pmJungle20.asp'
        if chapters[9]['href'] != 'pmJungle31.asp':
            chapters[9]['href'] = 'pmJungle31.asp'
    if not chapters:
        return None
    section_summs = []
    url_title_map = {}
    seen_urls = set()
    for c in chapters:
        href = c.get('href')
        title = get_clean_text(c)
        title = title if 'pmBabbitt' not in link else ''
        url = urllib.parse.urljoin(link, href)
        orig_url = url
        if 'dpbolvw' in url:
            continue
        dead_links1 = set(['pmVanity'])
        dead_links2 = set([
            'pmPrincePauper', 'pmIdiot', 'pmFatherSon', 'pmGreenwood',
            'pmOfHuman'
        ])
        dead_links3 = set(['pmDeerSlayer', 'pmTypee'])
        is_dead1 = any(x in orig_url for x in dead_links1)
        is_dead2 = any(x in orig_url for x in dead_links2)
        is_dead3 = any(x in orig_url for x in dead_links3)
        if is_dead1 or is_dead2 or is_dead3:
            # http://www.pinkmonkey.com:80/booknotes/monkeynotes/pmIdiot16.asp and up pages are dead
            # likewise for other strings
            page_no = int(re.findall(r'\d+', orig_url)[-1])
            if is_dead1 and page_no >= 17:
                continue
            elif is_dead2 and page_no >= 16:
                continue
            elif is_dead3 and page_no >= 13:
                continue
        if orig_url in seen_urls:
            continue
        if archived:
            orig_url = urllib.parse.urljoin(get_orig_url(link), c.get('href'))
            url = get_archived(orig_url, update_old)
        url_title_map[url] = title
        seen_urls.add(orig_url)

    for url, title in url_title_map.items():
        summs = process_story(url, title)
        for summ in summs:
            # print(' ', summ[0])
            if summ[1]:  # not empty text
                section_summs.append(summ)

    # manual fixes
    extra_sections = []
    if 'pmWinesburg' in link:
        extra_sections = [
            "pmWinesburg20.asp", "pmWinesburg21.asp", "pmWinesburg22.asp"
        ]
    elif 'pmDubliners' in link:
        extra_sections = [
            "pmDubliners12.asp", "pmDubliners16.asp"
        ]  # pmDubliners57.asp has no "Summary" heading, so skip
    if extra_sections:
        if archived:
            links_addtl = [
                get_archived(urllib.parse.urljoin(get_orig_url(link), href),
                             update_old) for href in extra_sections
            ]
        else:
            links_addtl = [
                urllib.parse.urljoin(link, x) for x in extra_sections
            ]
        sect_summs_addtl = [process_story(x) for x in links_addtl]
        sect_summs_addtl = [x[0] for x in sect_summs_addtl]
        section_summs.extend(sect_summs_addtl)
    return section_summs
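process_next_link uses a find_all_stripped helper that does not appear in this listing. A minimal sketch, assuming it behaves like find_all but matches the regex against each tag's stripped text so stray whitespace does not break the match:

import re


def find_all_stripped(name, soup, pattern):
    # Presumed helper: all <name> tags whose stripped text matches pattern.
    return [tag for tag in soup.find_all(name)
            if re.search(pattern, tag.get_text().strip())]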
def get_summaries(books_list,
                  base_url,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  title_set=None,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    soup = get_soup(books_list)
    title_url_map = {}
    for book in soup.find(class_='content active').findAll('li'):
        title = book.find('h4').text.strip()
        if title_set and title not in title_set:
            continue
        url = urllib.parse.urljoin(BASE_URL, book.a['href'])
        title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_orig_url(url)
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_overview = get_plot_summary(soup, base_url, archived, update_old)
        plot_overview = ''  # NOTE: the scraped plot overview is currently discarded

        section_summaries = []
        for (section_name, url) in get_sections(soup, base_url, archived,
                                                update_old):
            orig_url = url
            if archived:
                url = get_archived(get_orig_url(url), update_old)
            summary = get_section_summary(url, base_url, archived, update_old)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: Implement retrieving genre from external source
            plot_overview=plot_overview,
            source='cliffsnotes',
            section_summaries=section_summaries)

        book_summaries.append(bs)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from cliffsnotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(url, base_url, archived=False, update_old=False):
    # manual fix for the Sense and Sensibility Chapter 37 page, which has two Analysis headings
    sense37 = 'https://www.cliffsnotes.com/literature/s/sense-and-sensibility/summary-and-analysis/chapter-37' in url
    analysis_found = False
    soup_all = get_soup(url)
    soup = soup_all.find(class_='copy')
    if not soup:  # this happens if out of date, need to update the archive.org version
        print(f'{url} NO COPY CLASS!')
        return []
    children = list(soup.children)

    section_summary = []
    for i, child in enumerate(children):
        try:
            if len(child.findAll('p')) > 0:
                for c in child.children:
                    try:
                        if c.name == 'p':
                            text = c.text.strip()
                            if text == 'Analysis':
                                analysis_found = True
                                raise BreakIt
                            if len(text) > 0 and text != 'Summary':
                                section_summary.append(text)
                    except AttributeError:
                        continue
            elif child.name == 'p':
                text = child.text.strip()
                if sense37 and text == 'Analysis':
                    sense37 = False
                    continue
                elif text == 'Analysis':
                    analysis_found = True
                    break
                if len(text) > 0 and text != 'Summary':
                    section_summary.append(text)
            elif child.name == 'h2' or child.name == 'h3':
                text = child.text.strip()
                if text == 'Analysis':
                    analysis_found = True
                    break
        except AttributeError:
            continue
        except BreakIt:
            break
    if len(section_summary) > 0 and not analysis_found:
        next_soup = soup_all.find(class_='small-6 columns clear-padding-right')
        if not next_soup:
            return section_summary
        href = next_soup.a['href']
        if href.endswith('character-list'):
            return section_summary
        # if 'book-summary-2' in href: # TODO: delete this
        #     next_url = 'https://' + get_orig_url(href)
        elif archived:
            next_url = get_archived(get_orig_url(href), update_old)
        else:
            next_url = urllib.parse.urljoin(base_url, href)

        is_continued = 'continued on next page' in section_summary[-1].lower()
        if is_continued:
            del section_summary[-1]
        cond = next_url.startswith(url)
        if is_continued or cond:
            soup = get_soup(next_url)
            try:
                summary = get_section_summary(next_url, base_url, archived,
                                              update_old)
                section_summary.extend(summary)
            except IndexError:
                pass
    return section_summary
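The raise BreakIt / except BreakIt pair in get_section_summary uses a custom exception to break out of the nested loop as soon as the Analysis heading is reached. It is not defined in this listing; presumably it is just an empty Exception subclass:

class BreakIt(Exception):
    # Raised to escape the nested child loop once the Analysis heading is found.
    pass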
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():  # iterate through books
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        author = ''  # TODO: figure this out
        soup = get_soup(url)
        contents = soup.find('table', id='Table56')
        if contents:
            idx = 3
        else:
            contents = soup.find('table', width='99%')
            idx = 4
        if not contents:
            print('table of contents not found on ', url)
            continue

        cells = contents.find('tbody').find_all(
            'tr', recursive=False)[idx].find_all('a')
        cells = [x for x in cells if num_in(get_clean_text(x))]
        if not cells:
            print('no chapters found for ', url)
            continue

        sects = []
        for c in cells:  # iterate through sections
            text = get_clean_text(c)
            if 'Interpretation' in text:
                continue
            href = c['href']
            link_summ = urllib.parse.urljoin(url, href)
            if archived:
                if '/' not in href:
                    orig_url = urllib.parse.urljoin(get_orig_url(url), href)
                else:
                    orig_url = get_orig_url(href)
                link_summ = get_archived(orig_url, update_old)
            paras = process_chapter(link_summ)
            if not paras:
                print('no summaries found on ', link_summ)
                continue
            text = standardize_section_titles(text)
            sects.append((text, paras))
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source='bookwolf',
                                section_summaries=sects)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from bookwolf'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
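The bookwolf scraper filters table-of-contents links with get_clean_text and num_in, neither of which is shown. A minimal sketch under the assumption that num_in simply checks for a digit (so only chapter-like links are kept) and get_clean_text collapses whitespace:

import re


def get_clean_text(tag):
    # Presumed helper: the tag's text with runs of whitespace collapsed.
    return re.sub(r'\s+', ' ', tag.get_text()).strip()


def num_in(text):
    # Presumed helper: True if the string contains at least one digit.
    return bool(re.search(r'\d', text))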
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        table = soup.find('div', id='block-booknavigation-3') or soup.find(
            'div', id='block-block-4')

        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:  # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print('  no plot summary found', plot_link)

        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []

        if not cells:
            print('  no section links found for', url)
            continue

        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print('  seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue

            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])

            try:
                page_summs = process_story(link_summ)
            except AttributeError:  # page failed to load, try again
                print('  retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)

            if page_summs:
                section_summs.extend(page_summs)
                seen_sects.add(section_title_chap)
        if not section_summs:
            print('  could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=plot_summ,
                                source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
Example #15
def get_summaries(guides_page,
                  base_url,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  title_set=None,
                  sleep=SLEEP,
                  flatten=True):
    def add_summaries(url, section_summaries, flatten=True):
        # helper function
        summary_obj = get_section_summary(url, archived, update_old)
        multisect_title, sect_summs = summary_obj
        logging.info(multisect_title)
        if flatten:
            for sect_summ in sect_summs:
                sect_title, sect_paras = sect_summ
                if sect_title == 'Summary':
                    sect_title = multisect_title
                if re.match(RE_ANALYSIS, sect_title):
                    continue
                logging.info(sect_title)
                summary_obj_new = (sect_title, sect_paras)
                section_summaries.append(summary_obj_new)
        else:
            section_summaries.append(summary_obj)

    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    soup = get_soup(guides_page, sleep=sleep)
    title_url_map = {}
    for section in soup.findAll('section'):
        for book in section.findAll('h4'):
            title = book.a.text.strip()
            if title_set and title not in title_set:
                continue
            url = urllib.parse.urljoin(base_url, book.a['href'])
            title_url_map[title] = url

    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if archived:
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url, sleep=sleep)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_url, section_urls = get_plot_section_urls(url, base_url, archived,
                                                       update_old)
        if plot_url:
            plot_overview = get_plot_summary(plot_url)
        else:
            plot_overview = None

        section_summaries = []
        for url in section_urls:
            add_summaries(url, section_summaries)
        if book == 'The Yellow Wallpaper':
            section_summaries = [('Book', plot_overview)]
        if not section_summaries:
            continue
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: get genre from external source
            plot_overview=plot_overview,
            source='sparknotes',
            section_summaries=section_summaries)

        book_summaries.append(bs)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from sparknotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
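Several of the get_summaries functions skip a book when get_author returns nothing. The helper is site-specific and not shown; as a rough, hypothetical sketch it might look for any element whose class name mentions "author" (the real selectors almost certainly differ per site):

import re


def get_author(soup):
    # Hypothetical: grab the first element whose CSS class mentions "author".
    tag = soup.find(class_=re.compile('author', re.I))
    return tag.get_text().strip() if tag else ''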
Example #16
def get_section_summary(section_url,
                        archived=False,
                        update_old=False,
                        retry=0,
                        sleep=SLEEP):
    def _get_type(child):
        name = child.name if child.name in H3H4 or child.name == 'p' else None
        return name

    def _get_first(soup):
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(ind, page_elements=page_elements):
            while ind < len(page_elements) and _get_type(
                    page_elements[ind]) is None:
                ind += 1
            return ind

        ind = _increment_ind(0)
        elem = page_elements[ind]

        paragraphs = []
        while _get_type(elem) == 'p':
            paragraphs.append(elem.text.strip().replace('\n', ' '))
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
        return paragraphs, ind

    def _scrape_page(soup, ind=0):
        sub_section_summaries = []
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(ind, page_elements=page_elements):
            while ind < len(page_elements) and _get_type(
                    page_elements[ind]) is None:
                ind += 1
            return ind

        # reached first subsection heading
        while ind < len(page_elements):
            ind = _increment_ind(ind)
            elem = page_elements[ind]
            el_type = _get_type(elem)
            assert el_type == 'h3' or el_type == 'h4'
            sub_section_name = elem.text.strip()

            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
            paragraphs = []
            while _get_type(elem) == 'p':
                paragraphs.append(elem.text.strip().replace('\n', ' '))
                ind = _increment_ind(ind + 1)
                if ind == len(page_elements):
                    break
                elem = page_elements[ind]

            sub_section_summaries.append((sub_section_name, paragraphs))

        return sub_section_summaries

    # scrape main page
    soup = get_soup(section_url, sleep=sleep)
    title_tag = soup.find(
        class_='interior-header__title__pagetitle') or soup.find('h2')
    ERRORS = set(['Something bad happened. Sorry.', 'read ECONNRESET'])
    is_error_page = not title_tag or title_tag.text in ERRORS
    if retry == 0 and is_error_page:
        return get_section_summary(section_url, archived, update_old, retry=1)
    # elif retry == 1 and is_error_page:
    #     archived_url = get_archived(section_url, update_old)
    #     print('WARNING: could not load page {} , trying archived version from {}'.format(section_url, archived_url))
    #     return get_section_summary(archived_url, archived, update_old, retry=2)
    elif is_error_page:
        print('could not process {}'.format(section_url))
        os._exit(-1)
    section_name = title_tag.text.strip()
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide.findAll(H3H4):
        paragraphs = _get_paragraphs(soup)
        summaries = [(section_name, paragraphs)]
    else:
        # skip any initial notes
        paragraphs, ind = _get_first(soup)
        summaries = _scrape_page(soup, ind)
    # scrape other pages, if any
    pagination = soup.find(class_='pagination-links') or \
                 soup.find(class_='interior-sticky-nav__navigation__list--short') or \
                 soup.find(class_='interior-sticky-nav__navigation')
    # # TODO: we can use below logic if sparknotes fixes www.sparknotes.com/lit/crime/section10/ ,
    # # which has chapters 1-4 on page 2, then chapter 5 on page 3
    # if summaries:
    #     at_analysis = re.match(RE_ANALYSIS, summaries[-1][0])
    # else:
    #     at_analysis = False
    # if not at_analysis and pagination is not None:
    if pagination is not None:
        pages = pagination.findAll('a')
        for page in pages[1:]:
            page_url = urllib.parse.urljoin(section_url, page['href'])
            if archived:
                orig_url = urllib.parse.urljoin(get_orig_url(section_url),
                                                page['href'])
                page_url = get_archived(orig_url, update_old)
                page_url = page_url.replace(
                    '/https://', '/', 1)  # avoid strange bug with archive.org
            soup = get_soup(page_url, sleep=sleep)
            studyguide = soup.find('div', {'class': 'studyGuideText'})
            if not studyguide:
                soup = get_soup(page_url, sleep=sleep)
                studyguide = soup.find('div', {'class': 'studyGuideText'})
            # if not studyguide:
            #     archived_url = get_archived(page_url)
            #     print('WARNING: study guide not found for {} , trying archived version from {}'.format(page_url, archived_url))
            #     soup = get_soup(archived_url, sleep=sleep)
            #     studyguide = soup.find('div', {'class': 'studyGuideText'})
            if studyguide and not studyguide.findAll(H3H4):
                # no sub-sections, so get all paragraphs and add to previous
                paragraphs = _get_paragraphs(soup)
                summaries[-1][1].extend(paragraphs)
            else:
                # get paragraphs before first subsection
                paragraphs, ind = _get_first(soup)
                summaries[-1][1].extend(paragraphs)
                page_summaries = _scrape_page(soup, ind=ind)
                summaries.extend(page_summaries)
    return section_name, summaries
Example #17
def get_summaries(page_title_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([(x.title, x.source) for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for page, title in page_title_map.items():
        if 'barrons' in page.lower():
            source = 'barrons'
        elif 'monkeynotes' in page.lower():
            source = 'monkeynotes'
        else:
            print('unknown source for', page)
            continue
        if (title, source) in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            page = get_archived(page, update_old)
        print('processing', title, page)
        author = ''  # TODO: figure this out

        soup_book = get_soup(page)

        next_link = soup_book.find('a', text=RE_NEXT)
        story_link = soup_book.find('a', text=RE_STORY)
        is_hard_times = 'pinkmonkey.com/booknotes/barrons/hardtms.asp' in page

        if not (next_link or story_link or is_hard_times):
            print('cannot find any summaries for ', page)
            continue
        if is_paywall(page):
            print('    page is under a paywall, will be more errors: ', page)
            # continue

        if next_link:  # monkeynotes
            href = next_link.get('href')
            url = urllib.parse.urljoin(get_orig_url(page), href)
            url = get_archived(url, update_old)
            sect_summs = process_next_link(url, archived, update_old)
        elif story_link:  # barrons (most)
            url = page
            sect_summs = process_story_link(url, archived, update_old)
        elif is_hard_times:
            url = page
            sect_summs = process_next_link(url, archived, update_old)
        else:
            print('unexpected page layout for', page)
            sys.exit()

        if not sect_summs:
            print('    Cannot process {}'.format(url))
            # NOTE: expected to reach here for barrons Oliver Twist and barrons The Secret Sharer
            continue

        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source=source,
                                section_summaries=sect_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from pinkmonkey'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries