def get_pages_titles(index_pages, books_list, title_set=None):
    """Collect book-page links and their cleaned titles from index pages.

    Args:
        index_pages: iterable of index-page URLs to scrape (processed sorted).
        books_list: base URL used to absolutize the collected relative links.
        title_set: optional collection of titles; when given, only books whose
            cleaned title is in it are kept.

    Returns:
        (book_pages, titles): parallel lists of absolute links and titles.
    """
    book_pages = []
    titles = []
    for page_link in sorted(index_pages):  # get book pages
        soup_page = get_soup(page_link)
        book_links = soup_page.find_all(
            'p')  # 1 p element has 1 or more book listings
        for b in book_links:
            for elem in b.find_all('a'):
                href = elem.get('href')
                basename = href.rsplit('/', 1)[-1]
                # keep only .asp book pages; skip per-chapter "notes" pages
                # and the site's first.asp index page
                if href.endswith(
                        '.asp'
                ) and 'notes' not in basename and 'first.asp' != basename:
                    title = get_clean_text(elem)
                    title = title.replace('Downloadable/Printable Version', '')
                    # NOTE(review): this strips a space-like character from the
                    # title; presumably it was a non-breaking space in the
                    # original markup -- confirm against live pages
                    title = title.replace(' ', '')
                    # drop empty/non-book entries (quote pages, "Read the ..." links)
                    if not title or title == 'Quotes' or title == 'Quotations' or title.startswith(
                            'Read the'):
                        continue
                    if title_set and title not in title_set:
                        continue
                    book_pages.append(href)
                    titles.append(title)
    book_pages = get_absolute_links(book_pages, books_list)
    return book_pages, titles
def is_paywall(link):
    """Return True when the page at *link* displays the paywall notice."""
    page = get_soup(link)
    return any(
        get_clean_text(par).startswith('NOTICE: Unfortunately')
        for par in page.find_all('p'))
def get_title_url_map(books_list, title_set=None):
    """Map book titles to absolute guide URLs scraped from the book-list page.

    Args:
        books_list: URL of the page listing all study guides.
        title_set: optional collection of titles to restrict the result to.

    Returns:
        dict mapping cleaned title -> absolute guide URL.
    """
    page = get_soup(books_list, sleep=SLEEP)
    # previously matched class_='views-table cols-2'
    anchors = page.find('table', class_='cols-2').find_all('a')
    mapping = {}
    for anchor in anchors:
        name = get_clean_text(anchor).replace(' Study Guide', '')
        if title_set and name not in title_set:
            continue
        mapping[name] = urllib.parse.urljoin(books_list, anchor.get('href'))
    return mapping
def find_articles(search_query, domain, num_pages=3):
    """Search for articles and return the filtered result links.

    Args:
        search_query: query string passed to the search-URL builder.
        domain: site domain used both to build the search URL and to filter
            the scraped links.
        num_pages: number of result pages to fetch; defaults to 3, matching
            the previously hard-coded ``range(1, 4)`` behavior.

    Returns:
        list of links accumulated across all fetched pages, page order kept.
    """
    full_links = []
    for page in range(1, num_pages + 1):
        print("Fetching page ", page)
        url = get_url(search_query, domain, page)
        soup = scrape_lib.get_soup(url)
        links = get_links(soup)
        links = filter_links(links, domain)
        # extend instead of repeated list concatenation (avoids O(n^2) copying)
        full_links.extend(links)
    return full_links
def get_plot_summary(url, sleep=SLEEP):
    """Return the plot-summary paragraphs for a study-guide page.

    Retries the fetch once when the study-guide div is missing, then falls
    back to an archive.org snapshot.  When the page contains sub-section
    headings, delegates to get_section_summary() and returns its text part.
    """
    soup = get_soup(url, sleep=sleep)
    # a plot page is expected to be a single page; pagination means we were
    # given the wrong kind of page
    pagination = soup.find(class_='pagination-links') or soup.find(
        class_='interior-sticky-nav__navigation__list--short')
    assert pagination is None
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide:
        # transient load failure -- retry once
        soup = get_soup(url, sleep=sleep)
        studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide:
        # still missing: fall back to the archive.org copy
        archived_url = get_archived(url)
        print(
            'WARNING: study guide not found for {} , trying archived version from {}'
            .format(url, archived_url))
        soup = get_soup(archived_url, sleep=sleep)
        studyguide = soup.find('div', {'class': 'studyGuideText'})
    if studyguide and not studyguide.findAll(H3H4):
        # no sub-section headings: the whole page is the plot summary
        return _get_paragraphs(soup)
    else:
        # page has sub-sections: reuse the section scraper, keep only the text
        return get_section_summary(url)[1]
def get_title_url_map(books_list, title_set=None):
    """Build a title -> absolute URL map from the nested-table book index.

    Args:
        books_list: URL of the book index page (nested-table layout).
        title_set: optional collection of titles to restrict the result to.

    Returns:
        dict mapping title -> absolute book URL.
    """
    page = get_soup(books_list)
    # the second width-less table holds one nested table per column of books
    columns = page.find_all('table', width=None)[1].find_all('table')
    mapping = {}
    for column in columns:
        for row in column.find_all('tr'):
            paragraph = row.find('p')
            for anchor in paragraph.find_all('a'):
                name = get_clean_text(anchor)
                if title_set and name not in title_set:
                    continue
                mapping[name] = urllib.parse.urljoin(books_list,
                                                     anchor.get('href'))
    return mapping
def process_chapter(link):
    """Scrape one bookwolf chapter page into a list of summary paragraphs.

    Returns None (with a printed diagnostic) when zero or more than one
    candidate summary heading is found on the page.
    """
    soup = get_soup(link)
    # the summary section starts at a <p> matching RE_SUMM, or failing that,
    # a bold element matching RE_CONTEXT
    summ_lines = find_all_stripped('p', soup, RE_SUMM) or find_all_stripped(
        'b', soup, RE_CONTEXT)
    # manual fixes
    if link == 'http://www.bookwolf.com/Free_Booknotes/King_Lear_free_booknotes/Act_1_Scene_1_-_King_Lear/act_1_scene_1_-_king_lear.html':
        summ_lines = find_all_stripped('p', soup, RE_LEAR)
    elif link == 'http://www.bookwolf.com/Free_Booknotes/Othello/Act_3_Scene_2_-_Othello_Bookno/act_3_scene_2_-_othello_bookno.html':
        summ_lines = find_all_stripped('p', soup, 'ACT III – Scene.ii')
    if len(summ_lines) > 1:
        print('error, more than 1 summ line: ', link)
        return
    elif not summ_lines:
        print('no summ lines found: ', link)
        return
    # everything after the summary heading is chapter text
    ps = summ_lines[0].find_all_next('p')
    paragraphs = process_paragraphs(ps)
    return paragraphs
def get_index_pages(books_list, source):
    """Return the set of per-letter index-page URLs for a booknotes source.

    Args:
        books_list: URL of the site's master book-list page.
        source: 'monkeynotes' or 'barrons'; selects the <font> markup that
            wraps the alphabetical index links on each site.

    Returns:
        set of absolute index-page URLs.

    Raises:
        ValueError: if *source* is not recognized (previously this fell
            through and crashed later with a NameError on ``tables``).
    """
    soup = get_soup(books_list)
    if source == 'monkeynotes':
        tables = soup.find_all('font', color='#339900', face='Arial, Helvetica')
    elif source == 'barrons':
        tables = soup.find_all('font',
                               color='white',
                               face='Verdana, Arial, Helvetica, sans-serif')
    else:
        raise ValueError('unknown source: {}'.format(source))
    index_pages = []
    for table in tables:  # get index pages for each letter
        for entry in table.find_all('a'):
            index_pages.append(entry.get('href'))
    return set(urllib.parse.urljoin(books_list, x) for x in index_pages)
def process_plot(link):
    """Scrape the plot-summary paragraphs from a novelguide content page.

    Stops at the bold 'Analysis' heading; for paragraphs beginning with a
    bold heading, keeps only the content following the bold tag.
    """
    plot_summ = []
    soup = get_soup(link, sleep=SLEEP)
    content = soup.find('div', id='content-content')
    paras = content.find_all('p')
    for p in paras:
        text = get_clean_text(p, strip=False)
        bold = p.find(['b', 'strong'])
        if bold:
            if bold.get_text() == 'Analysis':
                # the summary section ends where the analysis begins
                break
            sibs = list(bold.next_siblings)
            if sibs:
                # NOTE(review): keeps only the LAST sibling after the bold
                # heading, stringified with markup -- confirm this is intended
                # for the pages this runs against
                text = str(sibs[-1])
            else:
                continue
        # skip the login prompt paragraph
        if p and not text.startswith('Log in'):
            plot_summ.append(text)
    return plot_summ
def get_plot_section_urls(url,
                          base_url=BASE_URL,
                          archived=False,
                          update_old=False,
                          sleep=SLEEP):
    """Find the plot-summary URL and the per-section URLs for a guide page.

    Returns:
        (plot_url, section_urls): plot_url may be '' when neither the live
        page nor an archive.org copy exists; section_urls is deduplicated on
        the original (non-archived) URL and preserves page order.
    """
    soup = get_soup(url, sleep=sleep)
    plot_url = urllib.parse.urljoin(url, 'summary/')
    status_code = requests.get(plot_url).status_code
    if status_code in set([404, 500]):
        # live summary page is gone; try the archive.org copy
        try:
            plot_url = get_archived(plot_url)
        except requests.exceptions.HTTPError as e:
            print('plot url not found at', plot_url)
            plot_url = ''
    section_urls = []
    seen = set()
    # lists = soup.find_all(class_='lists')
    # litems = lists[0].find_all('li')
    lists = soup.find_all(class_='landing-page__umbrella__section__list')
    litems = lists[1].find_all('li')
    if not litems:
        # NOTE(review): findAll is an alias of find_all, so this retry is a
        # no-op -- possibly lists[2] was intended; confirm
        litems = lists[1].findAll('li')
    if len(litems) == 1:
        # a single item means we grabbed the wrong list; use the next one
        litems = lists[2].findAll('li')
    for item in litems:
        if not item.a:
            continue
        href = item.a.get('href')
        # if not href:
        #     pass
        if 'section' in href:
            if archived:
                orig_url = get_orig_url(href)
                url = get_archived(orig_url, update_old)
            else:
                url = urllib.parse.urljoin(base_url, item.a['href'])
                orig_url = url
            # dedupe on the original URL so archived/live variants don't repeat
            if orig_url not in seen:
                section_urls.append(url)
                seen.add(orig_url)
    return plot_url, section_urls
def get_section_summary(url):
    """Split a section page into (sub_section_name, paragraphs) tuples.

    A heading is an h2/h3/h4 tag, a paragraph containing <strong>, or a
    short paragraph (< 20 words) mentioning a known HEADINGS word.
    Sub-sections whose name contains 'analysis' are dropped.
    """
    soup = get_soup(url)
    children = list(soup.find(class_='section__article').children)

    def _is_heading(child):
        # Returns True for headings; False or (implicitly) None otherwise --
        # callers only test truthiness, so both negatives behave the same.
        if child.name not in ['h2', 'h3', 'h4', 'p']:
            return False
        words = child.text.lower().strip().split()
        if child.name in ['h2', 'h3', 'h4']:
            return True
        elif child.strong is not None:
            return True
        elif child.name == 'p' and len(words) < 20:
            # a short paragraph acting as a heading, e.g. 'Summary'
            if any(heading in words for heading in HEADINGS):
                return True
        else:
            return False

    section_summary = []
    ind = 0
    while ind < len(children):
        # skip anything that is neither a heading nor a paragraph
        if not _is_heading(children[ind]) and children[ind].name != 'p':
            ind += 1
            continue
        #New sub-section
        if _is_heading(children[ind]):
            sub_section_name = children[ind].text.strip()
            ind += 1
        else:
            # paragraphs before any heading form an unnamed sub-section
            sub_section_name = None
        subsection = []
        while ind < len(children) and not _is_heading(children[ind]):
            if children[ind].name == 'p':
                subsection.append(children[ind].text.strip())
            ind += 1
        if sub_section_name and 'analysis' in sub_section_name.lower():
            # analysis sub-sections are intentionally excluded
            continue
        section_summary.append((sub_section_name, subsection))
    return section_summary
def process_story_link(link, archived, update_old):
    """Collect chapter summaries from every story linked on *link*.

    Returns None when the page has no story links at all; otherwise a list
    of (title, paragraphs) tuples gathered across all linked stories.
    """
    page = get_soup(link)
    story_anchors = page.find_all('a', text=RE_STORY)
    if not story_anchors:
        return None
    collected = []
    for anchor in story_anchors:  # a story page has multiple chapters
        href = anchor.get('href')
        # For page http://www.pinkmonkey.com/booknotes/barrons/billbud.asp ,
        # we want Typee, but not Billy Budd
        if not href or href.startswith('billbud'):
            continue
        if archived:
            target = get_archived(
                urllib.parse.urljoin(get_orig_url(link), href), update_old)
        else:
            target = urllib.parse.urljoin(link, href)
        story_summs = process_story(target)
        if story_summs:
            collected.extend(story_summs)
    return collected
def get_summaries(books_list,
                  base_url,
                  out_name,
                  pane_name,
                  use_pickled=False,
                  title_set=None,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape all gradesaver book summaries into BookSummary objects.

    Resumes from a previous pickle when *use_pickled* is set, checkpoints
    every *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary namedtuple-like objects.
    """
    # resume from an existing non-empty pickle if requested
    if use_pickled and os.path.exists(out_name) and os.path.getsize(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    # walk the A-Z index to build the title -> url map
    soup = get_soup(books_list)
    title_url_map = {}
    for link in soup.find(class_='alphabits').findAll('li'):
        page_url = urllib.parse.urljoin(base_url, link.a['href'])
        soup = get_soup(page_url)
        for book in soup.find(class_='columnList').findAll('li'):
            title = book.a.text.strip()
            if title_set and title not in title_set:
                continue
            url = urllib.parse.urljoin(base_url, book.a['href'])
            title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        plot_overview = get_plot_summary(soup, pane_name, base_url, archived,
                                         update_old)
        section_summaries = []
        sections = get_sections(soup, pane_name, base_url, archived,
                                update_old)
        for (section_name, url) in sections:
            summary = get_section_summary(url)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=
            None,  # TODO: Need to fix this and get genre from external source
            plot_overview=plot_overview,
            source='gradesaver',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        # periodic checkpoint so a crash doesn't lose everything
        if num_books > 1 and num_books % save_every == 0:
            print("Done scraping {} books".format(num_books))
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
    print('Scraped {} books from gradesaver'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape novelguide summaries for each (title, url) pair.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary objects with source='novelguide'.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for title, url in title_url_map.items():
        # normalize a known title-casing inconsistency
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        # per-book flag: may be cleared below when the archive copy is broken
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        # navigation block holding the plot/section links (two known layouts)
        table = soup.find('div', id='block-booknavigation-3') or soup.find(
            'div', id='block-block-4')
        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:  # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print(' no plot summary found', plot_link)
        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []
        if not cells:
            print(' no section links found for', url)
            continue
        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            # dedupe on the chapter part after the final colon
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print(' seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue
            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])
            try:
                page_summs = process_story(link_summ)
            except AttributeError:  # page failed to load, try again
                print(' retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)
            if page_summs:
                section_summs.extend(page_summs)
            seen_sects.add(section_title_chap)
        if not section_summs:
            print(' could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=plot_summ,
                                source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_summaries(books_list,
                  base_url,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  title_set=None,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape cliffsnotes summaries into BookSummary objects.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary objects with source='cliffsnotes'.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    soup = get_soup(books_list)
    title_url_map = {}
    for book in soup.find(class_='content active').findAll('li'):
        title = book.find('h4').text.strip()
        if title_set and title not in title_set:
            continue
        url = urllib.parse.urljoin(BASE_URL, book.a['href'])
        title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            url = get_orig_url(url)
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_overview = get_plot_summary(soup, base_url, archived, update_old)
        # NOTE(review): the scraped plot_overview is immediately discarded
        # here and replaced with '' -- confirm whether this is intentional
        # (e.g. plot handled elsewhere) or a leftover debug line
        plot_overview = ''
        section_summaries = []
        for (section_name, url) in get_sections(soup, base_url, archived,
                                                update_old):
            orig_url = url
            if archived:
                url = get_archived(get_orig_url(url), update_old)
            summary = get_section_summary(url, base_url, archived, update_old)
            section_summaries.append((section_name, summary))
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: Implement retrieving genre from external source
            plot_overview=plot_overview,
            source='cliffsnotes',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from cliffsnotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(url, base_url, archived=False, update_old=False):
    """Scrape one cliffsnotes section page into a list of paragraph strings.

    Collects paragraphs until an 'Analysis' heading; when no analysis was
    seen, follows the 'next page' link (recursively) and appends its
    paragraphs, dropping a trailing 'continued on next page' marker.
    """
    sense37, analysis_count = False, 0  # manual fix for this page with 2 Analysis headings
    if 'https://www.cliffsnotes.com/literature/s/sense-and-sensibility/summary-and-analysis/chapter-37' in url:
        sense37 = True
    analysis_found = False
    soup_all = get_soup(url)
    soup = soup_all.find(class_='copy')
    if not soup:  # this happens if out of date, need to update the archive.org version
        print(f'{url} NO COPY CLASS!')
        return []
    children = list(soup.children)
    section_summary = []
    for i, child in enumerate(children):
        try:
            # container element wrapping several <p> tags
            if len(child.findAll('p')) > 0:
                for c in child.children:
                    try:
                        if c.name == 'p':
                            text = c.text.strip()
                            if text == 'Analysis':
                                analysis_found = True
                                # BreakIt escapes both nested loops at once
                                raise BreakIt
                            if len(text) > 0 and text != 'Summary':
                                section_summary.append(text)
                    except AttributeError:
                        # NavigableString children have no .name
                        continue
            elif child.name == 'p':
                text = child.text.strip()
                if sense37 and text == 'Analysis':
                    # skip the FIRST Analysis heading on the sense-37 page
                    sense37 = False
                    continue
                elif text == 'Analysis':
                    analysis_found = True
                    break
                if len(text) > 0 and text != 'Summary':
                    section_summary.append(text)
            elif child.name == 'h2' or child.name == 'h3':
                text = child.text.strip()
                if text == 'Analysis':
                    analysis_found = True
                    break
        except AttributeError:
            continue
        except BreakIt:
            break
    # no analysis heading seen: the summary may continue on the next page
    if len(section_summary) > 0 and not analysis_found:
        next_soup = soup_all.find(class_='small-6 columns clear-padding-right')
        if not next_soup:
            return section_summary
        href = next_soup.a['href']
        if href.endswith('character-list'):
            return section_summary
        # if 'book-summary-2' in href:  # TODO: delete this
        #     next_url = 'https://' + get_orig_url(href)
        elif archived:
            next_url = get_archived(get_orig_url(href), update_old)
        else:
            next_url = urllib.parse.urljoin(base_url, href)
        is_continued = 'continued on next page' in section_summary[-1].lower()
        if is_continued:
            # drop the continuation marker itself
            del section_summary[-1]
        cond = next_url.startswith(url)
        if is_continued or cond:
            # NOTE(review): this fetch is unused -- the recursive call below
            # re-fetches next_url itself; looks like a redundant request
            soup = get_soup(next_url)
            try:
                summary = get_section_summary(next_url, base_url, archived,
                                              update_old)
                section_summary.extend(summary)
            except IndexError:
                pass
    return section_summary
def get_info(self, url):
    """Fetch *url* and return [headline, published date, content] as strings."""
    page = scrape_lib.get_soup(url)
    return [
        str(self.get_headline(page)),
        str(self.get_published_date(page)),
        str(self.get_content(page)),
    ]
def process_story(link, title=None, get_next=True, find_continued=False):
    """ returns tuples of (title, summary list) format

    Scrapes one pinkmonkey story page.  Section-heading detection and
    paragraph collection have several link-specific edge cases; when
    *get_next* is set, a single-section continuation page is merged into
    the last chapter.
    """
    soup = get_soup(link)
    chapters = []
    if find_continued:
        # recursive call from the page below: only look for continuation headers
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM_CONTINUED)
        if not lines:
            return []
    ### specific edge cases
    elif 'WhiteFang' in link:
        lines = find_all_stripped(
            ['p', 'h4'], soup, RE_CHAP) + find_all_stripped(['p', 'h4'], soup,
                                                            RE_SUMM)
    elif 'Ulysses' in link:
        lines = find_all_stripped('p', soup, RE_SUMM_3)
    elif 'pmKidnapped16' in link:
        # remove a spurious first match before collecting chapter headings
        find_all_stripped(['p', 'h4'], soup, RE_SUMM)[0].extract()
        lines = find_all_stripped(['p', 'h4'], soup, RE_CHAP)
    ###
    else:
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM) or find_all_stripped(['p', 'h4'], soup, RE_SUMM_2) or \
            find_all_stripped(['p', 'h4'], soup, RE_CHAP)
        lines = [
            x for x in lines
            if (x.find('b') and x.find('b').get_text(strip=True)) or
            x.name == 'h4'
        ]  # line should be bold
        if not lines or 'barrons/house' in link:
            lines.extend(find_all_stripped(['p', 'h4'], soup, RE_NUMDOT))
    if not lines:
        print(' cannot find section titles on', link)
        return []
    if 'pmFrankenstein10' in link:
        lines = lines[1:]
    # Frankenstein pages whose first heading is not Summary/LETTER need the
    # following <p> as the real heading
    frank_cond = 'pmFrankenstein' in link and not any(
        get_clean_text(lines[0]).startswith(x) for x in ('Summary', 'LETTER'))
    if 'barrons/heartdk' in link or frank_cond:
        lines = [lines[0].find_next('p')]
    for line in lines:
        # derive the chapter title: the heading itself, or the <p> before a
        # bare "Summary" heading; a caller-supplied title wins for single
        # sections
        if len(lines) > 1 or not title:
            title_ = line if not re.match(
                RE_SUMM, get_clean_text(line)) else line.find_previous('p')
            title_ = get_clean_text(title_)
        else:
            title_ = title
        if 'pmIdiot' in link or 'pmSecretSharer' in link:
            ps = line.find_all_next(['p', 'b'])
        elif 'wutherg' in link or 'Ulysses' in link:
            # these pages keep text in bare NavigableStrings between <p> tags;
            # wrap runs of strings into synthetic <p> tags
            ps = []
            indiv_strs = []
            for sib in line.next_siblings:
                if sib.name == 'p':
                    if indiv_strs:
                        p = element.Tag(name='p')
                        p.string = ' '.join(indiv_strs)
                        ps.append(p)
                        indiv_strs = []
                    ps.append(sib)
                elif isinstance(sib, element.NavigableString) and \
                        not (sib.endswith("Barron's Booknotes\n") or sib.startswith("MonkeyNotes")):
                    indiv_strs.append(sib)
            if indiv_strs:
                p = element.Tag(name='p')
                p.string = ' '.join(indiv_strs)
                ps.append(p)
        else:
            ps = line.find_all_next(['p', 'h4'])
        paragraphs = process_paragraphs(ps)
        chapters.append((title_, paragraphs))
    if 'junglex' in link:  # this should be moved to manual_fix_individual()
        # merge mis-split scenes of chapter 17 into a single chapter
        assert chapters[3][0] == 'CHAPTER 17'
        assert chapters[7][0] == 'CHAPTER 18'
        clean_scene = lambda x: re.sub('SCENE \d', '', x, 1)
        chapter17 = [
            *chapters[3][1], clean_scene(chapters[4][0]), *chapters[4][1],
            clean_scene(chapters[5][0]), *chapters[5][1],
            clean_scene(chapters[6][0])
        ]
        del chapters[6]
        del chapters[5]
        del chapters[4]
        chapters[3] = (chapters[3][0], chapter17)
    if get_next and chapters:  # check next page if is continued
        next_elem = soup.find('a', text=RE_NEXT)
        if not next_elem:
            pass
        else:
            next_link = urllib.parse.urljoin(link, next_elem['href'])
            chapters2 = process_story(next_link,
                                      get_next=get_next,
                                      find_continued=True)
            if not chapters2:
                pass
            elif len(chapters2) == 1:
                # single continuation section: append its text to our last chapter
                title1, paragraphs1 = chapters.pop(-1)
                title2, paragraphs2 = chapters2[0]
                chapters.append((title1, paragraphs1 + paragraphs2))
    return chapters
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape bookwolf summaries for each (title, url) pair.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.

    Returns:
        list of BookSummary objects with source='bookwolf'.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for title, url in title_url_map.items():  # iterate through books
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        author = ''  # TODO: figure this out
        soup = get_soup(url)
        # two known table-of-contents layouts; the chapter row index differs
        contents = soup.find('table', id='Table56')
        if contents:
            idx = 3
        else:
            contents = soup.find('table', width='99%')
            idx = 4
        if not contents:
            print('table of contents not found on ', url)
            continue
        cells = contents.find('tbody').find_all(
            'tr', recursive=False)[idx].find_all('a')
        # keep only links whose text contains a number (chapter links)
        cells = [x for x in cells if num_in(get_clean_text(x))]
        if not cells:
            print('no chapters found for ', url)
            continue
        sects = []
        for c in cells:  # iterate through sections
            text = get_clean_text(c)
            if 'Interpretation' in text:
                continue
            href = c['href']
            link_summ = urllib.parse.urljoin(url, href)
            if archived:
                # relative hrefs must be joined against the original page URL
                if '/' not in href:
                    orig_url = urllib.parse.urljoin(get_orig_url(url), href)
                else:
                    orig_url = get_orig_url(href)
                link_summ = get_archived(orig_url, update_old)
            paras = process_chapter(link_summ)
            if not paras:
                print('no summaries found on ', link_summ)
                continue
            text = standardize_section_titles(text)
            sects.append((text, paras))
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source='bookwolf',
                                section_summaries=sects)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from bookwolf'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_summaries(guides_page,
                  base_url,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  title_set=None,
                  sleep=SLEEP,
                  flatten=True):
    """Scrape sparknotes study guides into BookSummary objects.

    Resumes from *out_name* when *use_pickled* is set, checkpoints every
    *save_every* books, and writes the final list to *out_name*.  When
    *flatten* is true, multi-section pages are flattened into individual
    (section_title, paragraphs) entries with analysis sections dropped.

    Returns:
        list of BookSummary objects with source='sparknotes'.
    """

    def add_summaries(url, section_summaries, flatten=True):  # helper function
        # closes over `archived` / `update_old` from the enclosing call
        summary_obj = get_section_summary(url, archived, update_old)
        multisect_title, sect_summs = summary_obj
        logging.info(multisect_title)
        if flatten:
            for sect_summ in sect_summs:
                sect_title, sect_paras = sect_summ
                # a bare 'Summary' heading inherits the page title
                if sect_title == 'Summary':
                    sect_title = multisect_title
                if re.match(RE_ANALYSIS, sect_title):
                    continue
                logging.info(sect_title)
                summary_obj_new = (sect_title, sect_paras)
                section_summaries.append(summary_obj_new)
        else:
            section_summaries.append(summary_obj)

    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    soup = get_soup(guides_page, sleep=sleep)
    title_url_map = {}
    for section in soup.findAll('section'):
        for book in section.findAll('h4'):
            title = book.a.text.strip()
            if title_set and title not in title_set:
                continue
            url = urllib.parse.urljoin(base_url, book.a['href'])
            title_url_map[title] = url
    print('found {} books'.format(len(title_url_map)))
    for i, (book, url) in enumerate(title_url_map.items()):
        if book in done:
            continue
        if archived:
            url = get_archived(url, update_old)
        print('processing {} {}'.format(book, url))
        soup = get_soup(url, sleep=sleep)
        author = get_author(soup)
        if not author:
            print('author not found, skipping', book, url)
            continue
        plot_url, section_urls = get_plot_section_urls(url, base_url, archived,
                                                       update_old)
        if plot_url:
            plot_overview = get_plot_summary(plot_url)
        else:
            plot_overview = None
        section_summaries = []
        for url in section_urls:
            add_summaries(url, section_summaries)
        # single-story special case: use the plot overview as the only section
        if book == 'The Yellow Wallpaper':
            section_summaries = [('Book', plot_overview)]
        if not section_summaries:
            continue
        bs = BookSummary(
            title=book,
            author=author,
            genre=None,  # TODO: get genre from external source
            plot_overview=plot_overview,
            source='sparknotes',
            section_summaries=section_summaries)
        book_summaries.append(bs)
        num_books = len(book_summaries)
        # periodic checkpoint
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from sparknotes'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
def get_section_summary(section_url,
                        archived=False,
                        update_old=False,
                        retry=0,
                        sleep=SLEEP):
    """Scrape a sparknotes section (all of its pages) into sub-summaries.

    Returns:
        (section_name, summaries) where summaries is a list of
        (sub_section_name, paragraphs) tuples accumulated across pagination.

    Retries a failed page once; on repeated failure it prints a message and
    terminates the process (os._exit).
    """

    def _get_type(child):
        # 'h3'/'h4'/'p' for elements we care about, None for everything else
        name = child.name if child.name in H3H4 or child.name == 'p' else None
        return name

    def _get_first(soup):
        # collect the paragraphs BEFORE the first sub-section heading;
        # returns (paragraphs, index of the first heading)
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(
                ind,
                page_elements=page_elements,
        ):
            # skip elements that are neither headings nor paragraphs
            while ind < len(page_elements) and _get_type(
                    page_elements[ind]) is None:
                ind += 1
            return ind

        ind = _increment_ind(0)
        elem = page_elements[ind]
        paragraphs = []
        while _get_type(elem) == 'p':
            paragraphs.append(elem.text.strip().replace('\n', ' '))
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
        return paragraphs, ind

    def _scrape_page(soup, ind=0):
        # collect (heading, paragraphs) pairs starting at element index *ind*
        sub_section_summaries = []
        summary = soup.find('div', {'class': 'studyGuideText'})
        page_elements = list(summary.children)

        def _increment_ind(ind, page_elements=page_elements):
            while ind < len(page_elements) and _get_type(
                    page_elements[ind]) is None:
                ind += 1
            return ind

        # reached first subsection heading
        while ind < len(page_elements):
            ind = _increment_ind(ind)
            elem = page_elements[ind]
            el_type = _get_type(elem)
            assert el_type == 'h3' or el_type == 'h4'
            sub_section_name = elem.text.strip()
            ind = _increment_ind(ind + 1)
            elem = page_elements[ind]
            paragraphs = []
            while _get_type(elem) == 'p':
                paragraphs.append(elem.text.strip().replace('\n', ' '))
                ind = _increment_ind(ind + 1)
                if ind == len(page_elements):
                    break
                elem = page_elements[ind]
            sub_section_summaries.append((sub_section_name, paragraphs))
        return sub_section_summaries

    # scrape main page
    soup = get_soup(section_url, sleep=sleep)
    title_tag = soup.find(
        class_='interior-header__title__pagetitle') or soup.find('h2')
    ERRORS = set(['Something bad happened. Sorry.', 'read ECONNRESET'])
    is_error_page = not title_tag or title_tag.text in ERRORS
    if retry == 0 and is_error_page:
        # one retry for transient server errors
        return get_section_summary(section_url, archived, update_old, retry=1)
    # elif retry == 1 and is_error_page:
    #     archived_url = get_archived(section_url, update_old)
    #     print('WARNING: could not load page {} , trying archived version from {}'.format(section_url, archived_url))
    #     return get_section_summary(archived_url, archived, update_old, retry=2)
    elif is_error_page:
        # NOTE(review): os._exit kills the whole process without cleanup --
        # consider raising instead
        print('could not process {}'.format(section_url))
        os._exit(-1)
    section_name = title_tag.text.strip()
    studyguide = soup.find('div', {'class': 'studyGuideText'})
    if not studyguide.findAll(H3H4):
        # no sub-section headings: the whole page is one summary
        paragraphs = _get_paragraphs(soup)
        summaries = [(section_name, paragraphs)]
    else:
        # skip any initial notes
        paragraphs, ind = _get_first(soup)
        summaries = _scrape_page(soup, ind)
    # scrape other pages, if any
    pagination = soup.find(class_='pagination-links') or \
        soup.find(class_='interior-sticky-nav__navigation__list--short') or \
        soup.find(class_='interior-sticky-nav__navigation')
    # # TODO: we can use below logic if sparknotes fixes www.sparknotes.com/lit/crime/section10/ ,
    # # which has chapters 1-4 on page 2, then chapter 5 on page 3
    # if summaries:
    #     at_analysis = re.match(RE_ANALYSIS, summaries[-1][0])
    # else:
    #     at_analysis = False
    # if not at_analysis and pagination is not None:
    if pagination is not None:
        pages = pagination.findAll('a')
        for page in pages[1:]:
            page_url = urllib.parse.urljoin(section_url, page['href'])
            if archived:
                orig_url = urllib.parse.urljoin(get_orig_url(section_url),
                                                page['href'])
                page_url = get_archived(orig_url, update_old)
                page_url = page_url.replace(
                    '/https://', '/',
                    1)  # avoid strange bug with archive.org
            soup = get_soup(page_url, sleep=sleep)
            studyguide = soup.find('div', {'class': 'studyGuideText'})
            if not studyguide:
                # transient failure -- retry once
                soup = get_soup(page_url, sleep=sleep)
                studyguide = soup.find('div', {'class': 'studyGuideText'})
            # if not studyguide:
            #     archived_url = get_archived(page_url)
            #     print('WARNING: study guide not found for {} , trying archived version from {}'.format(page_url, archived_url))
            #     soup = get_soup(archived_url, sleep=sleep)
            #     studyguide = soup.find('div', {'class': 'studyGuideText'})
            if studyguide and not studyguide.findAll(H3H4):
                # no sub-sections, so get all paragraphs and add to previous
                paragraphs = _get_paragraphs(soup)
                summaries[-1][1].extend(paragraphs)
            else:
                # get paragraphs before first subsection
                paragraphs, ind = _get_first(soup)
                summaries[-1][1].extend(paragraphs)
                page_summaries = _scrape_page(soup, ind=ind)
                summaries.extend(page_summaries)
    return section_name, summaries
def process_story(link, title=None):
    """Scrape a novelguide story page into (title, paragraphs) chapters.

    Dispatches on the link to several page-layout handlers (colon-delimited,
    non-bold sections, <br>-broken chapters, bold-heading sections) and
    NFKD-normalizes all paragraph text before returning.
    """
    link = link.replace('http://www.novelguide.com',
                        'https://www.novelguide.com', 1)
    chapters = []
    soup = get_soup(link, sleep=SLEEP)
    if 'mansfield-park/' in link or 'jude-the-obscure' in link:
        content = soup.find('div', class_='content clear-block')
        paras = content.find_all(['p', 'strong', 'div'])[2:]
    else:
        content = soup.find('div', id='content-content')
        paras = content.find_all('p')
    if link.endswith('the-adventures-of-tom-sawyer/novel-summary'):
        # move the first paragraph into the second one, where it belongs
        initial = paras[1].children.__next__()
        initial.insert_before(paras[0])
    sect_summ = []
    # note: overwrites any caller-supplied title with the page title
    title = get_title(soup)
    break_found = False
    write = True
    if 'ivan-fyodorovich' in link:
        # this page from The Brothers Karamazov is different from the others
        texts = [p.text for p in paras]
        summs = colon_section(texts, title)
        # trim trailing non-summary lines from section 9
        summs[9] = (summs[9][0], summs[9][1][:-7])
        chapters.extend(summs)
    else:
        for p in paras:
            text = get_clean_text(p, strip=False).strip()
            if not text or text.startswith('Log in'):
                continue
            br = p.find_all('br')
            if any(x in link for x in NONBOLD_WITH_SECTIONS):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title))
            elif any(x in link for x in set([
                    'ulysses', 'siddhartha', 'awakening', 'brothers-karamazov',
                    'tess-of', 'the-ambass', 'jekyll', 'heart-of-darkness',
                    'winesburg'
            ])):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title, always_write=True))
            elif any(x in link for x in set(['monte-cristo'])):
                texts = list(p.stripped_strings)
                chapters.extend(colon_section(texts, title))
            elif (len(br) > 3 or re.match(RE_CHAP_OPEN, p.get_text()) or
                  any(x in link for x in BREAK_TITLES)) and \
                    'fathers-and-sons' not in link and 'hound' not in link:
                # <br>-separated multi-chapter paragraph
                break_found = True
                chapters.extend(process_chapters(p, title))
                title = list(p.stripped_strings)[0]
            else:  # for sections where the text is in multiple <p> tags
                if text == 'advertisement' and not 'the-awakening' in link:
                    break
                elif text == 'advertisement':
                    continue
                bold = p if p.name == 'strong' else p.find(['b', 'strong'])
                if bold:
                    write = True
                    bold_text = bold.get_text(strip=True)
                    is_summ = re.match(RE_PLOT, bold_text)
                    if any(bold_text.startswith(x) for x in ANALYSIS):
                        # analysis heading: flush the current section, stop writing
                        write = False
                        if sect_summ:
                            chapters.append((title, sect_summ))
                        sect_summ = []
                        continue
                    elif not is_summ:
                        # a new section heading: flush and start a new section
                        if sect_summ:
                            chapters.append((title, sect_summ))
                        title = bold_text if not is_summ else title
                        sect_summ = []
                    sibs = list(bold.next_siblings)
                    if write and sibs:
                        # keep only the plain-string siblings after the heading
                        sibs = [x.strip() for x in sibs if isinstance(x, str)]
                        text = ' '.join(sibs).strip()
                        sect_summ.append(text)
                elif text == 'Analysis':
                    write = False
                    continue
                else:
                    if write:
                        sect_summ.append(text)
    if not break_found and sect_summ:
        # flush the trailing section
        chapters.append((title, sect_summ))
    # normalize unicode and drop empty paragraphs
    for i, chapter in enumerate(chapters):
        norm = [unicodedata.normalize("NFKD", p).strip() for p in chapter[1]]
        norm = [x for x in norm if x]
        chapters[i] = (chapters[i][0], norm)
    return chapters
def process_next_link(link, archived, update_old):
    """Scrape all chapter summaries reachable from a monkeynotes index page.

    Collects every chapter link on the page (with several per-book manual
    fixes for broken hrefs and dead pages), scrapes each one with
    ``process_story``, and returns a list of (title, paragraphs) tuples.
    Returns ``None`` when no chapter links are found.

    :param link: index page URL (possibly an archive.org URL).
    :param archived: when True, resolve links through the Wayback Machine.
    :param update_old: passed through to ``get_archived``.
    """
    soup = get_soup(link)
    chapters = find_all_stripped('a', soup, RE_CHAP)
    # Per-book fixes for index pages with missing or broken chapter links.
    if 'pmEthanFrome' in link:
        chapters += soup.find_all('a', text=RE_OPEN)
    elif 'pmDubliners' in link:
        h3s = soup.find_all('h3')
        for h3 in h3s:
            if h3.text.startswith('Short Story'):
                chapters = h3.find_next_sibling('p').find_all('a')
    elif 'wutherg' in link:
        if chapters[-3]['href'] != 'wutherg47.asp':
            chapters[-3]['href'] = 'wutherg47.asp'
    elif 'pmJungle' in link:
        if chapters[3]['href'] != 'pmJungle20.asp':
            chapters[3]['href'] = 'pmJungle20.asp'
        if chapters[9]['href'] != 'pmJungle31.asp':
            chapters[9]['href'] = 'pmJungle31.asp'
    if not chapters:
        return None
    section_summs = []
    url_title_map = {}
    seen_urls = set()
    # Books whose later pages are dead; hoisted out of the loop (invariant).
    # http://www.pinkmonkey.com:80/booknotes/monkeynotes/pmIdiot16.asp and up
    # pages are dead; likewise for the other strings.
    dead_links1 = {'pmVanity'}
    dead_links2 = {
        'pmPrincePauper', 'pmIdiot', 'pmFatherSon', 'pmGreenwood', 'pmOfHuman'
    }
    dead_links3 = {'pmDeerSlayer', 'pmTypee'}
    for c in chapters:
        href = c.get('href')
        title = get_clean_text(c)
        # pmBabbitt link texts are unusable as titles.
        title = title if 'pmBabbitt' not in link else ''
        url = urllib.parse.urljoin(link, href)
        orig_url = url
        if 'dpbolvw' in url:  # affiliate/ad link, not a chapter
            continue
        is_dead1 = any(x in orig_url for x in dead_links1)
        is_dead2 = any(x in orig_url for x in dead_links2)
        is_dead3 = any(x in orig_url for x in dead_links3)
        if is_dead1 or is_dead2 or is_dead3:
            # Skip pages past each book's known-dead threshold.
            # BUG FIX: regex was '\d+' (invalid escape in a non-raw string,
            # a DeprecationWarning/SyntaxWarning on modern Python).
            page_no = int(re.findall(r'\d+', orig_url)[-1])
            if is_dead1 and page_no >= 17:
                continue
            elif is_dead2 and page_no >= 16:
                continue
            elif is_dead3 and page_no >= 13:
                continue
        if orig_url in seen_urls:
            continue
        if archived:
            orig_url = urllib.parse.urljoin(get_orig_url(link), c.get('href'))
            url = get_archived(orig_url, update_old)
        url_title_map[url] = title
        seen_urls.add(orig_url)
    for url, title in url_title_map.items():
        summs = process_story(url, title)
        for summ in summs:
            # print('  ', summ[0])
            if summ[1]:  # not empty text
                section_summs.append(summ)
    # manual fixes: sections missing from the index page
    extra_sections = []
    if 'pmWinesburg' in link:
        extra_sections = [
            "pmWinesburg20.asp", "pmWinesburg21.asp", "pmWinesburg22.asp"
        ]
    elif 'pmDubliners' in link:
        extra_sections = [
            "pmDubliners12.asp", "pmDubliners16.asp"
        ]  # pmDubliners57.asp has no "Summary" heading, so skip
    if extra_sections:
        if archived:
            links_addtl = [
                get_archived(urllib.parse.urljoin(get_orig_url(link), href),
                             update_old) for href in extra_sections
            ]
        else:
            links_addtl = [
                urllib.parse.urljoin(link, x) for x in extra_sections
            ]
        # Each extra page yields sections; keep only the first of each.
        sect_summs_addtl = [process_story(x) for x in links_addtl]
        sect_summs_addtl = [x[0] for x in sect_summs_addtl]
        section_summs.extend(sect_summs_addtl)
    return section_summs
def get_summaries(page_title_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape pinkmonkey book summaries and pickle them to ``out_name``.

    :param page_title_map: mapping of book index page URL -> book title.
    :param out_name: pickle path for the resulting list of ``BookSummary``.
    :param use_pickled: resume from an existing pickle if present.
    :param archived: fetch pages through the Wayback Machine.
    :param update_old: passed through to ``get_archived``.
    :param save_every: checkpoint the pickle every N new books.
    :param sleep: seconds to wait between books.
    :returns: list of ``BookSummary`` namedtuples/objects.
    """
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        # (title, source) pairs already scraped; skip them below.
        done = set([(x.title, x.source) for x in book_summaries])
    else:
        book_summaries = []
        done = set()
    for page, title in page_title_map.items():
        if 'barrons' in page.lower():
            source = 'barrons'
        elif 'monkeynotes' in page.lower():
            source = 'monkeynotes'
        else:
            # BUG FIX: previously `source` was left unassigned here, so an
            # unrecognized URL either raised NameError (first iteration) or
            # silently reused the previous book's source. Skip explicitly.
            print('unknown source for page, skipping: ', page)
            continue
        if (title, source) in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            page = get_archived(page, update_old)
        print('processing', title, page)
        author = ''  # TODO: figure this out
        soup_book = get_soup(page)
        next_link = soup_book.find('a', text=RE_NEXT)
        story_link = soup_book.find('a', text=RE_STORY)
        # This one barrons page has neither link but is still scrapable.
        is_hard_times = 'pinkmonkey.com/booknotes/barrons/hardtms.asp' in page
        if not (next_link or story_link or is_hard_times):
            print('cannot find any summaries for ', page)
            continue
        if is_paywall(page):
            print(' page is under a paywall, will be more errors: ', page)
            # continue
        if next_link:  # monkeynotes
            href = next_link.get('href')
            url = urllib.parse.urljoin(get_orig_url(page), href)
            url = get_archived(url, update_old)
            sect_summs = process_next_link(url, archived, update_old)
        elif story_link:  # barrons (most)
            url = page
            sect_summs = process_story_link(url, archived, update_old)
        elif is_hard_times:
            url = page
            sect_summs = process_next_link(url, archived, update_old)
        else:
            print('error')
            sys.exit()
        if not sect_summs:
            print(' Cannot process {}'.format(url))
            # NOTE: expected to reach here for barrons Oliver Twist and
            # barrons The Secret Sharer
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source=source,
                                section_summaries=sect_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        # Periodic checkpoint so a crash doesn't lose all progress.
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))
    print('Scraped {} books from pinkmonkey'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries