Code example #1
def is_paywall(link):
    """Return True if the page at `link` contains the paywall notice paragraph."""
    soup = get_soup(link)
    for p in soup.find_all('p'):
        p_text = get_clean_text(p)
        if p_text.startswith('NOTICE: Unfortunately'):
            return True
    return False
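For orientation, here is a minimal usage sketch (not part of the original source): it assumes the same `get_soup`/`get_clean_text` helpers the module provides elsewhere, and a hypothetical `candidate_links` list of URLs to screen before scraping.

# Hypothetical usage of is_paywall(): drop paywalled pages up front.
def filter_paywalled(candidate_links):
    usable = []
    for link in candidate_links:
        if is_paywall(link):
            print('skipping paywalled page:', link)
            continue
        usable.append(link)
    return usable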
Code example #2
def get_pages_titles(index_pages, books_list, title_set=None):
    """Return parallel lists of book page links and cleaned titles scraped from the index pages."""
    book_pages = []
    titles = []
    for page_link in sorted(index_pages):  # get book pages
        soup_page = get_soup(page_link)
        # one <p> element has 1 or more book listings
        book_links = soup_page.find_all('p')
        for b in book_links:
            for elem in b.find_all('a'):
                href = elem.get('href')
                basename = href.rsplit('/', 1)[-1]
                if (href.endswith('.asp') and 'notes' not in basename
                        and basename != 'first.asp'):
                    title = get_clean_text(elem)
                    title = title.replace('Downloadable/Printable Version', '')
                    title = title.replace('&nbsp', '')
                    if (not title or title in ('Quotes', 'Quotations')
                            or title.startswith('Read the')):
                        continue
                    if title_set and title not in title_set:
                        continue
                    book_pages.append(href)
                    titles.append(title)
    book_pages = get_absolute_links(book_pages, books_list)
    return book_pages, titles
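As a rough illustration of how the two parallel lists might be consumed (assuming `index_pages`, `books_list`, and the `get_absolute_links` helper referenced above), they can be zipped into a title-to-page mapping:

# Illustrative only; index_pages and books_list are placeholders for the source's URLs.
book_pages, titles = get_pages_titles(index_pages, books_list)
title_page_map = dict(zip(titles, book_pages))
for title, page in sorted(title_page_map.items()):
    print(title, '->', page)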
Code example #3
def process_paragraphs(ps):
    """Collect cleaned summary paragraphs from `ps`, stopping at the Notes heading or the next section."""
    paragraphs = []
    for p in ps:
        if not p:
            continue
        para = get_clean_text(p, strip=False).strip()
        if para == 'Notes':
            break
        if (not para or any(para.startswith(x) for x in BAD_STARTS)
                or p.name == 'b'):
            continue
        a = p.find('a')
        if a and a.get('href'):
            continue
        if p.find('i'):
            i_text = p.find('i').get_text(strip=True)
            if len(i_text) >= .9 * len(para):
                continue

        b_text = None
        b = p.find('b')
        if b:
            b_text = b.get_text(strip=True)
            if b_text == ', ':
                b_text = None
            elif b_text == 'Om':  # fix for http://www.pinkmonkey.com/booknotes/monkeynotes/pmSiddhartha20.asp
                para = para.replace('Om', ' Om')
                b_text = None
        if p.name == 'h4' or b_text:
            break  # reached another section's paragraphs
        para = para.replace('’', "'")  # replace weird apostrophe
        paragraphs.append(para)
    return paragraphs
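`BAD_STARTS` is referenced above but defined elsewhere in the module. A purely illustrative stand-in and a typical call might look like the following (assuming `soup` comes from `get_soup` on a section page):

# Illustrative stand-in, not the project's actual constant.
BAD_STARTS = ('Next Page', 'Previous Page', 'Table of Contents')

# Typical call: run every <p>/<h4> element of a section page through the filter.
paragraphs = process_paragraphs(soup.find_all(['p', 'h4']))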
Code example #4
def get_author(soup):
    """Extract the author name from the title header of the page."""
    # written_by = soup.find(class_='subnav__writtenby')
    # try:
    #     return written_by.find('a').text.strip()
    # except AttributeError as e:
    #     print(e, 'in get_author')
    #     return ''
    written_by = (soup.find(class_='TitleHeader_authorLink')
                  or soup.find(class_='TitleHeader_authorName'))
    return get_clean_text(written_by)
Code example #5
def get_title_url_map(books_list, title_set=None):
    """Map book titles to absolute study-guide URLs from the listing page at `books_list`."""
    soup = get_soup(books_list, sleep=SLEEP)
    # book_links = soup.find('table', class_='views-table cols-2').find_all('a')
    book_links = soup.find('table', class_='cols-2').find_all('a')
    title_url_map = {}
    for link in book_links:
        title = get_clean_text(link).replace(' Study Guide', '')
        if title_set and title not in title_set:
            continue
        link = link.get('href')
        title_url_map[title] = urllib.parse.urljoin(books_list, link)
    return title_url_map
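A short usage sketch with a hypothetical `BOOKS_LIST` index URL and title filter; the returned map is what a downstream driver such as the `get_summaries` functions below iterates over:

# Hypothetical driver snippet; BOOKS_LIST and the titles are placeholders.
wanted = {'Pride and Prejudice', 'Dracula'}
title_url_map = get_title_url_map(BOOKS_LIST, title_set=wanted)
for title, url in title_url_map.items():
    print(title, url)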
Code example #6
def get_title_url_map(books_list, title_set=None):
    """Map book titles to absolute URLs, reading the nested table columns of the listing page."""
    soup = get_soup(books_list)
    columns = soup.find_all('table', width=None)[1].find_all('table')

    title_url_map = {}
    for column in columns:
        cells = column.find_all('tr')
        for cell in cells:
            p = cell.find('p')
            entries = p.find_all('a')
            for entry in entries:
                title = get_clean_text(entry)
                if title_set and title not in title_set:
                    continue
                href = entry.get('href')
                title_url_map[title] = urllib.parse.urljoin(books_list, href)
    return title_url_map
Code example #7
def process_paragraphs(ps):
    """Collect cleaned paragraphs until the 'Interpretation' heading or the next bold section title."""
    paragraphs = []
    for p in ps:
        if not p:
            continue
        para = get_clean_text(p)
        if para == 'Interpretation':
            break
        if not para:
            continue
        # if p.find('i') and p.find('i').get_text(strip=True):
        #     continue
        if p.find('b') and p.find('b').get_text(strip=True):
            break  # reached another section's paragraphs
        paragraphs.append(para)
    return paragraphs
Code example #8
def process_plot(link):
    """Return the plot-overview paragraphs from a summary page, stopping at the 'Analysis' heading."""
    plot_summ = []
    soup = get_soup(link, sleep=SLEEP)
    content = soup.find('div', id='content-content')
    paras = content.find_all('p')
    for p in paras:
        text = get_clean_text(p, strip=False)
        bold = p.find(['b', 'strong'])
        if bold:
            if bold.get_text() == 'Analysis':
                break
            sibs = list(bold.next_siblings)
            if sibs:
                text = str(sibs[-1])
            else:
                continue
        if p and not text.startswith('Log in'):
            plot_summ.append(text)
    return plot_summ
Code example #9
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape Bookwolf section summaries for every book in `title_url_map`,
    pickling progress to `out_name` every `save_every` books."""
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():  # iterate through books
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        author = ''  # TODO: figure this out
        soup = get_soup(url)
        contents = soup.find('table', id='Table56')
        if contents:
            idx = 3
        else:
            contents = soup.find('table', width='99%')
            idx = 4
        if not contents:
            print('table of contents not found on ', url)
            continue

        cells = contents.find('tbody').find_all(
            'tr', recursive=False)[idx].find_all('a')
        cells = [x for x in cells if num_in(get_clean_text(x))]
        if not cells:
            print('no chapters found for ', url)
            continue

        sects = []
        for c in cells:  # iterate through sections
            text = get_clean_text(c)
            if 'Interpretation' in text:
                continue
            href = c['href']
            link_summ = urllib.parse.urljoin(url, href)
            if archived:
                if '/' not in href:
                    orig_url = urllib.parse.urljoin(get_orig_url(url), href)
                else:
                    orig_url = get_orig_url(href)
                link_summ = get_archived(orig_url, update_old)
            paras = process_chapter(link_summ)
            if not paras:
                print('no summaries found on ', link_summ)
                continue
            text = standardize_section_titles(text)
            sects.append((text, paras))
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=None,
                                source='bookwolf',
                                section_summaries=sects)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from bookwolf'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
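A minimal end-to-end sketch, assuming one of the `get_title_url_map` helpers above produces the map for this source and that `BOOKS_LIST` and `OUT_NAME` are placeholder constants; `use_pickled=True` lets an interrupted crawl resume from the partial pickle, and `save_every` controls how often progress is flushed:

# Illustrative driver; BOOKS_LIST and OUT_NAME are assumed constants.
if __name__ == '__main__':
    title_url_map = get_title_url_map(BOOKS_LIST)
    book_summaries = get_summaries(title_url_map,
                                   OUT_NAME,
                                   use_pickled=True,
                                   archived=False,
                                   save_every=5,
                                   sleep=1)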
Code example #10
def process_next_link(link, archived, update_old):
    """Collect (section title, paragraphs) tuples from a book's chapter index, applying per-book link fixes."""
    soup = get_soup(link)

    chapters = find_all_stripped('a', soup, RE_CHAP)
    if 'pmEthanFrome' in link:
        chapters += soup.find_all('a', text=RE_OPEN)
    elif 'pmDubliners' in link:
        h3s = soup.find_all('h3')
        for h3 in h3s:
            if h3.text.startswith('Short Story'):
                chapters = h3.find_next_sibling('p').find_all('a')
    elif 'wutherg' in link:
        if chapters[-3]['href'] != 'wutherg47.asp':
            chapters[-3]['href'] = 'wutherg47.asp'
    elif 'pmJungle' in link:
        if chapters[3]['href'] != 'pmJungle20.asp':
            chapters[3]['href'] = 'pmJungle20.asp'
        if chapters[9]['href'] != 'pmJungle31.asp':
            chapters[9]['href'] = 'pmJungle31.asp'
    if not chapters:
        return None
    section_summs = []
    url_title_map = {}
    seen_urls = set()
    for c in chapters:
        href = c.get('href')
        title = get_clean_text(c)
        title = title if 'pmBabbitt' not in link else ''
        url = urllib.parse.urljoin(link, href)
        orig_url = url
        if 'dpbolvw' in url:
            continue
        dead_links1 = set(['pmVanity'])
        dead_links2 = set([
            'pmPrincePauper', 'pmIdiot', 'pmFatherSon', 'pmGreenwood',
            'pmOfHuman'
        ])
        dead_links3 = set(['pmDeerSlayer', 'pmTypee'])
        is_dead1 = any(x in orig_url for x in dead_links1)
        is_dead2 = any(x in orig_url for x in dead_links2)
        is_dead3 = any(x in orig_url for x in dead_links3)
        if is_dead1 or is_dead2 or is_dead3:
            # http://www.pinkmonkey.com:80/booknotes/monkeynotes/pmIdiot16.asp and up pages are dead
            # likewise for other strings
            page_no = int(re.findall(r'\d+', orig_url)[-1])
            if is_dead1 and page_no >= 17:
                continue
            elif is_dead2 and page_no >= 16:
                continue
            elif is_dead3 and page_no >= 13:
                continue
        if orig_url in seen_urls:
            continue
        if archived:
            orig_url = urllib.parse.urljoin(get_orig_url(link), c.get('href'))
            url = get_archived(orig_url, update_old)
        url_title_map[url] = title
        seen_urls.add(orig_url)

    for url, title in url_title_map.items():
        summs = process_story(url, title)
        for summ in summs:
            # print(' ', summ[0])
            if summ[1]:  # not empty text
                section_summs.append(summ)

    # manual fixes
    extra_sections = []
    if 'pmWinesburg' in link:
        extra_sections = [
            "pmWinesburg20.asp", "pmWinesburg21.asp", "pmWinesburg22.asp"
        ]
    elif 'pmDubliners' in link:
        extra_sections = [
            "pmDubliners12.asp", "pmDubliners16.asp"
        ]  # pmDubliners57.asp has no "Summary" heading, so skip
    if extra_sections:
        if archived:
            links_addtl = [
                get_archived(urllib.parse.urljoin(get_orig_url(link), href),
                             update_old) for href in extra_sections
            ]
        else:
            links_addtl = [
                urllib.parse.urljoin(link, x) for x in extra_sections
            ]
        sect_summs_addtl = [process_story(x) for x in links_addtl]
        sect_summs_addtl = [x[0] for x in sect_summs_addtl]
        section_summs.extend(sect_summs_addtl)
    return section_summs
Code example #11
def process_story(link, title=None, get_next=True, find_continued=False):
    """
    returns tuples of (title, summary list) format
    """
    soup = get_soup(link)
    chapters = []
    if find_continued:
        lines = find_all_stripped(['p', 'h4'], soup, RE_SUMM_CONTINUED)
        if not lines:
            return []
    ### specific edge cases
    elif 'WhiteFang' in link:
        lines = (find_all_stripped(['p', 'h4'], soup, RE_CHAP) +
                 find_all_stripped(['p', 'h4'], soup, RE_SUMM))
    elif 'Ulysses' in link:
        lines = find_all_stripped('p', soup, RE_SUMM_3)
    elif 'pmKidnapped16' in link:
        find_all_stripped(['p', 'h4'], soup, RE_SUMM)[0].extract()
        lines = find_all_stripped(['p', 'h4'], soup, RE_CHAP)
    ###
    else:
        lines = (find_all_stripped(['p', 'h4'], soup, RE_SUMM)
                 or find_all_stripped(['p', 'h4'], soup, RE_SUMM_2)
                 or find_all_stripped(['p', 'h4'], soup, RE_CHAP))
    lines = [
        x for x in lines
        if (x.find('b') and x.find('b').get_text(strip=True)) or x.name == 'h4'
    ]  # line should be bold
    if not lines or 'barrons/house' in link:
        lines.extend(find_all_stripped(['p', 'h4'], soup, RE_NUMDOT))
    if not lines:
        print('    cannot find section titles on', link)
        return []
    if 'pmFrankenstein10' in link:
        lines = lines[1:]
    frank_cond = 'pmFrankenstein' in link and not any(
        get_clean_text(lines[0]).startswith(x) for x in ('Summary', 'LETTER'))
    if 'barrons/heartdk' in link or frank_cond:
        lines = [lines[0].find_next('p')]
    for line in lines:
        if len(lines) > 1 or not title:
            if re.match(RE_SUMM, get_clean_text(line)):
                title_ = line.find_previous('p')
            else:
                title_ = line
            title_ = get_clean_text(title_)
        else:
            title_ = title
        if 'pmIdiot' in link or 'pmSecretSharer' in link:
            ps = line.find_all_next(['p', 'b'])
        elif 'wutherg' in link or 'Ulysses' in link:
            ps = []
            indiv_strs = []
            for sib in line.next_siblings:
                if sib.name == 'p':
                    if indiv_strs:
                        p = element.Tag(name='p')
                        p.string = ' '.join(indiv_strs)
                        ps.append(p)
                        indiv_strs = []
                    ps.append(sib)
                elif isinstance(sib, element.NavigableString) and \
                    not (sib.endswith("Barron's Booknotes\n") or sib.startswith("MonkeyNotes")):
                    indiv_strs.append(sib)
            if indiv_strs:
                p = element.Tag(name='p')
                p.string = ' '.join(indiv_strs)
                ps.append(p)
        else:
            ps = line.find_all_next(['p', 'h4'])
        paragraphs = process_paragraphs(ps)
        chapters.append((title_, paragraphs))
    if 'junglex' in link:  # this should be moved to manual_fix_individual()
        assert chapters[3][0] == 'CHAPTER 17'
        assert chapters[7][0] == 'CHAPTER 18'
        clean_scene = lambda x: re.sub(r'SCENE \d', '', x, count=1)
        chapter17 = [
            *chapters[3][1],
            clean_scene(chapters[4][0]), *chapters[4][1],
            clean_scene(chapters[5][0]), *chapters[5][1],
            clean_scene(chapters[6][0])
        ]
        del chapters[6]
        del chapters[5]
        del chapters[4]
        chapters[3] = (chapters[3][0], chapter17)
    if get_next and chapters:  # check next page if is continued
        next_elem = soup.find('a', text=RE_NEXT)
        if not next_elem:
            pass
        else:
            next_link = urllib.parse.urljoin(link, next_elem['href'])
            chapters2 = process_story(next_link,
                                      get_next=get_next,
                                      find_continued=True)
            if not chapters2:
                pass
            elif len(chapters2) == 1:
                title1, paragraphs1 = chapters.pop(-1)
                title2, paragraphs2 = chapters2[0]
                chapters.append((title1, paragraphs1 + paragraphs2))
    return chapters
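The `RE_*` constants (`RE_SUMM`, `RE_CHAP`, `RE_NEXT`, `RE_SUMM_CONTINUED`, ...) and the `find_all_stripped` helper are defined elsewhere in the module and are not shown in these excerpts. The definitions below are only plausible stand-ins to make the matching logic concrete, not the project's actual patterns:

import re

# Illustrative stand-ins for the module's heading patterns (assumptions, not the real ones).
RE_SUMM = re.compile(r'^Summary', re.IGNORECASE)
RE_CHAP = re.compile(r'^(CHAPTER|ACT|PART|SCENE)\b', re.IGNORECASE)
RE_NEXT = re.compile(r'Next\s*Page', re.IGNORECASE)
RE_SUMM_CONTINUED = re.compile(r'Summary.*\(continued\)', re.IGNORECASE)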
Code example #12
def get_summaries(title_url_map,
                  out_name,
                  use_pickled=False,
                  archived=False,
                  update_old=False,
                  save_every=5,
                  sleep=0):
    """Scrape Novelguide plot and section summaries for every book in `title_url_map`,
    pickling progress to `out_name` every `save_every` books."""
    if use_pickled and os.path.exists(out_name):
        with open(out_name, 'rb') as f1:
            book_summaries = pickle.load(f1)
        print('loaded {} existing summaries, resuming'.format(
            len(book_summaries)))
        done = set([x.title for x in book_summaries])
    else:
        book_summaries = []
        done = set()

    for title, url in title_url_map.items():
        title = title.replace("DeerSlayer", 'Deerslayer', 1)
        if title in done:
            continue
        if sleep:
            time.sleep(sleep)
        author = ''  # TODO: figure this out
        archived_local = archived
        if archived:
            orig_url = url
            url = get_archived(url, update_old)
        print('processing', title, url)
        soup = get_soup(url, sleep=SLEEP)
        table = (soup.find('div', id='block-booknavigation-3')
                 or soup.find('div', id='block-block-4'))

        # process plot summary
        plot_summ = None
        plot_cell = table.find('a', href=RE_PLOT_LINK)
        if plot_cell:
            plot_title = plot_cell.get_text()
            href = plot_cell['href']
            if archived:
                plot_link = get_orig_url(href)
                plot_link = get_archived(plot_link, update_old)
                if 'archive.org' not in plot_link:  # failed to retrieve archived version
                    # archived versions of 'the-mayor-of-casterbridge' seem to be corrupted
                    time.sleep(5.0)
                    archived_local = False
            else:
                plot_link = urllib.parse.urljoin(url, href)
            if 'Chapter' not in plot_title:
                plot_summ = process_plot(plot_link)
            if not plot_summ:
                print('  no plot summary found', plot_link)

        # process section summaries
        cells = table.find_all('a', href=RE_SUMM_LINK)
        if title == "The Brothers Karamazov":
            cells = sort_cells(cells)
        section_summs = []

        if not cells:
            print('  no section links found for', url)
            continue

        seen_sects = set()
        for c in cells:
            section_title = get_clean_text(c)
            section_title_chap = section_title.rsplit(':', 1)[-1]
            if section_title_chap in seen_sects:
                print('  seen {} already, skipped'.format(section_title_chap))
                continue
            if re.match(RE_PLOT, section_title):
                continue

            if archived and archived_local:
                link_summ = get_orig_url(c['href'])
                link_summ = get_archived(link_summ, update_old)
            else:
                link_summ = urllib.parse.urljoin(url, c['href'])

            try:
                page_summs = process_story(link_summ)
            except AttributeError:  # page failed to load, try again
                print('  retrying after 5 seconds...')
                time.sleep(5.0)
                page_summs = process_story(link_summ)

            if page_summs:
                section_summs.extend(page_summs)
                seen_sects.add(section_title_chap)
        if not section_summs:
            print('  could not find summaries for {}'.format(title))
            continue
        book_summ = BookSummary(title=title,
                                author=author,
                                genre=None,
                                plot_overview=plot_summ,
                                source='novelguide',
                                section_summaries=section_summs)
        book_summaries.append(book_summ)
        num_books = len(book_summaries)
        if num_books > 1 and num_books % save_every == 0:
            with open(out_name, 'wb') as f:
                pickle.dump(book_summaries, f)
            print("Done scraping {} books".format(num_books))

    print('Scraped {} books from novelguide'.format(len(book_summaries)))
    with open(out_name, 'wb') as f:
        pickle.dump(book_summaries, f)
    print('wrote to', out_name)
    return book_summaries
Code example #13
def process_story(link, title=None):
    """Return (section title, paragraph list) tuples parsed from a Novelguide summary page."""
    link = link.replace('http://www.novelguide.com',
                        'https://www.novelguide.com', 1)
    chapters = []
    soup = get_soup(link, sleep=SLEEP)
    if 'mansfield-park/' in link or 'jude-the-obscure' in link:
        content = soup.find('div', class_='content clear-block')
        paras = content.find_all(['p', 'strong', 'div'])[2:]
    else:
        content = soup.find('div', id='content-content')
        paras = content.find_all('p')
    if link.endswith('the-adventures-of-tom-sawyer/novel-summary'):
        initial = next(paras[1].children)
        initial.insert_before(paras[0])
    sect_summ = []
    title = get_title(soup)
    break_found = False
    write = True
    if 'ivan-fyodorovich' in link:  # this page from The Brothers Karamazov is different from the others
        texts = [p.text for p in paras]
        summs = colon_section(texts, title)
        summs[9] = (summs[9][0], summs[9][1][:-7])
        chapters.extend(summs)
    else:
        for p in paras:
            text = get_clean_text(p, strip=False).strip()
            if not text or text.startswith('Log in'):
                continue
            br = p.find_all('br')
            if any(x in link for x in NONBOLD_WITH_SECTIONS):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title))
            elif any(x in link for x in set([
                    'ulysses', 'siddhartha', 'awakening', 'brothers-karamazov',
                    'tess-of', 'the-ambass', 'jekyll', 'heart-of-darkness',
                    'winesburg'
            ])):
                texts = list(p.stripped_strings)
                chapters.extend(other_section(texts, title, always_write=True))
            elif any(x in link for x in set(['monte-cristo'])):
                texts = list(p.stripped_strings)
                chapters.extend(colon_section(texts, title))
            elif (len(br) > 3 or re.match(RE_CHAP_OPEN, p.get_text()) or any(x in link for x in BREAK_TITLES)) and \
                    'fathers-and-sons' not in link and 'hound' not in link:
                break_found = True
                chapters.extend(process_chapters(p, title))
                title = list(p.stripped_strings)[0]
            else:  # for sections where the text is in multiple <p> tags
                if text == 'advertisement' and 'the-awakening' not in link:
                    break
                elif text == 'advertisement':
                    continue
                bold = p if p.name == 'strong' else p.find(['b', 'strong'])
                if bold:
                    write = True
                    bold_text = bold.get_text(strip=True)
                    is_summ = re.match(RE_PLOT, bold_text)
                    if any(bold_text.startswith(x) for x in ANALYSIS):
                        write = False
                        if sect_summ:
                            chapters.append((title, sect_summ))
                            sect_summ = []
                        continue
                    elif not is_summ:
                        if sect_summ:
                            chapters.append((title, sect_summ))
                        title = bold_text if not is_summ else title
                        sect_summ = []
                    sibs = list(bold.next_siblings)
                    if write and sibs:
                        sibs = [x.strip() for x in sibs if isinstance(x, str)]
                        text = ' '.join(sibs).strip()
                        sect_summ.append(text)
                elif text == 'Analysis':
                    write = False
                    continue
                else:
                    if write:
                        sect_summ.append(text)
    if not break_found and sect_summ:
        chapters.append((title, sect_summ))

    for i, chapter in enumerate(chapters):
        norm = [unicodedata.normalize("NFKD", p).strip() for p in chapter[1]]
        norm = [x for x in norm if x]
        chapters[i] = (chapters[i][0], norm)
    return chapters
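The final `unicodedata.normalize('NFKD', ...)` pass mainly folds compatibility characters left over from the HTML, such as the non-breaking space BeautifulSoup produces for `&nbsp;`, into plain spaces. A quick illustration:

import unicodedata

sample = 'Chapter\xa0One'  # contains a non-breaking space (U+00A0)
print(unicodedata.normalize('NFKD', sample) == 'Chapter One')  # True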