Example #1
def get_chapter(url):
    print("Processing: " + url)
    html = gsweb.get_soup(url)
    if 'wuxiaworld.com' in url:
        chapter_title, contents = get_wuxiaworld_com(html)
    elif 'wuxiaworld.co' in url:
        chapter_title, contents = get_wuxiaworld_co(html)
    elif 'stabbingwithasyringe' in url:
        chapter_title, contents = get_syringe(html)
    else:
        raise SystemExit('Something went wrong! Unsupported server!')
    # Novel-dependent cleanup
    try:
        print('Cleaning...')
        novel = __import__(novel_module)
        contents = novel.clean(contents)
        print('Clean')
    except ImportError:
        pass
    soup_str = "".join(map(str, contents))
    # Before turning the html into a soup, replace all weird chinese spaces
    # with actual spaces.
    soup_str = soup_str.replace('\u3000', ' ')  # U+3000 ideographic space
    # And replace double br tags with a paragraph break
    soup_str = re.sub(r'</br>', '', soup_str)
    soup_str = re.sub(r'<br/>[\t\n\r\f\v\s ]*<br/>', '\n<p>', soup_str)
    soup_str = re.sub(r'<br/>', '</p>\n<p>', soup_str)

    print(chapter_title)
    chapter_file = clean_chapter_name(chapter_title)
    print(chapter_file)
    # Then turn the string back into a soup
    soup_text = BeautifulSoup(soup_str, 'lxml')
    # Remove all attributes from all tags
    for tag in soup_text.findAll(True):
        tag.attrs = {}
    # Remove empty paragraphs, including those which only contain br tags or
    # the weird space character (why the &·$% do you have a paragraph with
    # nothing?)
    for paragraph in soup_text.findAll(['span', 'p']):
        if not paragraph.text or paragraph.text in [' ', '。']:
            paragraph.decompose()
    # Remove stray br tags
    # for br_tag in soup_text.findAll('br'):
    #     br_tag.decompose()
    # Turn the soup into text
    # text = str(soup_text)
    text = soup_text.prettify()

    # Undo some ridiculous censoring
    # text = damnit.sub('damn it', text)
    # text = damned.sub('damned', text)
    # text = f**k.sub('f**k', text)

    return chapter_title, chapter_file, text
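
For reference, a minimal self-contained run of the br-to-paragraph normalization used above (the sample HTML string is invented for illustration; the unbalanced tags it produces are fixed when the string is re-parsed with BeautifulSoup, as the function does):

import re

sample = '<p>First line<br/> <br/>Second line<br/>third line</p>'
sample = re.sub(r'</br>', '', sample)
sample = re.sub(r'<br/>[\t\n\r\f\v\s ]*<br/>', '\n<p>', sample)
sample = re.sub(r'<br/>', '</p>\n<p>', sample)
print(sample)
# <p>First line
# <p>Second line</p>
# <p>third line</p>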
Example #2
def get_chapter(url):
    global chapterCount
    chapterCount = chapterCount + 1
    pagehtml = gsweb.get_soup(url)
    print("Current url: " + url)
    pages_re = re.compile('"pages":([0-9]*),', re.IGNORECASE)
    pages = int(pages_re.search(str(pagehtml)).group(1))
    print("Pages in this chapter: {}".format(pages))
    text = []
    chaptertitle = pagehtml.select('h1.h2')[0].get_text().strip()
    chapterfile = "{}.xhtml".format(
        chaptertitle.replace(" ", "-") + "-" + str(chapterCount))
    text.append("<h2>{}</h2>\n".format(chaptertitle))
    for i in range(1, pages + 1):
        page_url = url + "/page/" + str(i)
        print("Working on: " + page_url)
        text.append('<div class="page">\n')
        for j in get_page(page_url):
            text.append(j.prettify())
        text.append('</div>\n')
    chapter = "".join(text)
    return chaptertitle, chapterfile, chapter
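
A small illustration of the page-count extraction above; the snippet is a guess at what the relevant part of the page source looks like, not the real markup:

import re

snippet = '... "pages":3, "title":"Some chapter" ...'
pages_re = re.compile('"pages":([0-9]*),', re.IGNORECASE)
print(int(pages_re.search(snippet).group(1)))  # 3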
Example #3
def genlist(start, end):
    global origin
    list_page = gsweb.get_soup(origin)
    chapterlist = []
    for i in range(start, end + 1):
        # print(i)
        if i in [29, 115, 342, 825, 1183, 1794]:
            continue
        elif i in range(1, 572 + 1):
            text = '^Chapter %s .*' % str(i)
            if i == 370:
                text = '^Chapter %s$' % str(i)
            elif i in [351, 353, 354]:
                text = '^Chapter %s - ' % str(i)
        elif i in [573]:
            text = '^AST: Chapter %s .*' % str(i)
        elif i in [584, 585, 586, 587, 588, 589, 605, 616]:
            text = '^AST: Chapter %s!$' % str(i)
        elif i in [590, 800]:
            text = '^chapter %s$' % str(i)
        elif i in [596, 598, 799, 1416] + list(range(1440, 2492 + 1)):
            text = '^AST %s ' % str(i)
            if i in [1797, 1957, 2281]:
                text = '^AST %s- ' % str(i)
            elif i == 2345:
                text = '^AST 2345 - Fifth .*'
            elif i == 2435:
                text = '^AST 2345 - Tyrannous .*'
            elif i in [2468, 2473]:
                text = '^Chapter %s - ' % str(i)
        elif i in [597, 600, 603, 606, 609, 610, 611, 613, 614, 615, 617, 619
                   ] + list(range(591, 595 + 1)) + list(range(623, 626 + 1)):
            text = '^Chapter %s$' % str(i)
        elif i in [599, 601, 602, 604, 607, 608, 621, 668, 670, 671, 672, 675,
                   676, 677, 679, 681, 682, 684, 685, 686, 687, 689, 691]:
            text = '^Chapter %s!' % str(i)
        elif i in [612]:
            text = '^Chapter %s .*' % str(i)
        elif i in [618, 620, 622, 627, 631, 633, 635, 639, 642, 645, 648, 650]:
            text = '^AST Chapter: %s!' % str(i)
        elif i in [654, 658, 661, 663, 666, 669, 674, 678, 680, 683, 688, 690,
                   693, ] + list(range(697, 798+1)) + \
                list(range(801, 1415+1)) + list(range(1417, 1436+1)):
            text = '^Chapter: %s$' % str(i)
            if i == 1184:
                text = '^1184$'
        elif i == 2493:
            text = '^Author.*'
        else:
            text = '^Chapter %s$' % str(i)
        link = list_page.find('a', text=re.compile(text))
        url = origin + link['href'].split('/')[-1]
        chapterlist.append(url)
    return chapterlist
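
The lookup above depends on finding the anchor whose link text matches a per-chapter regex. A self-contained sketch of that pattern, using a made-up table-of-contents snippet and a placeholder origin URL:

import re
from bs4 import BeautifulSoup

origin = 'https://example.com/novel/'  # placeholder, not the real index page
toc = BeautifulSoup('<a href="/novel/chapter-12">Chapter 12</a>', 'lxml')
link = toc.find('a', text=re.compile('^Chapter 12$'))
print(origin + link['href'].split('/')[-1])  # https://example.com/novel/chapter-12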
Example #4
def get_page(text_url):
    text = gsweb.get_soup(text_url).select_one('pre').findChildren()
    return text
Example #5
def get_book(initial_url):
    base_url = 'http://www.wattpad.com'
    html = gsweb.get_soup(initial_url)

    # Get basic book information
    author = html.select('div.author-info__username')[0].get_text()
    title = html.select('div.story-info__title')[0].get_text().strip()
    description = html.select('pre.description-text')[0].get_text()
    coverurl = html.select('div.story-cover img')[0]['src']
    labels = ['Wattpad']
    for label in html.select('div.tags a'):
        if '/' in label['href']:
            labels.append(label.get_text())
    if debug:
        print("Author: " + author)
        print("Title: " + title)
        print("Description: " + description)
        print("Cover: " + coverurl)
        print("Labels:" + " ".join(labels))

    print("'{}' by {}".format(title, author).encode("utf-8"))
    # print(next_page_url)

    # Get list of chapters
    chapterlist = list(dict.fromkeys(html.select('.story-parts ul li a')))

    # Remove from the file name those characters that Microsoft does NOT allow.
    # This also affects the FAT filesystem used on most phone/tablet sdcards
    # and other devices used to read epub files.
    # Disallowed characters: \/:*?"<>|^
    filename = title
    for i in ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '^']:
        if i in filename:
            filename = filename.replace(i, '')
    # Apple products disallow files starting with dot
    filename = filename.lstrip('.')

    epubfile = "{} - {}.epub".format(filename, author)
    if not os.path.exists(epubfile):
        identifier = "wattpad.com//%s/%s" % (initial_url.split('/')[-1],
                                             len(chapterlist))
        LANGUAGE = 'en'
        book = MyBook(identifier, title, LANGUAGE, 'wattpad2epub')
        book.add_author(author)
        # Add all labels.
        book.add_labels(labels)
        # Add a cover if it's available
        cover_file = 'cover.jpg'
        if get_cover(coverurl, cover_file):
            book.add_cover(cover_file)
            os.remove(cover_file)

        # Define CSS style
        with open(os.path.join(PROG_DIR, "CSS", "nav.css")) as style_nav:
            book.add_nav_style(style_nav.read())
        with open(os.path.join(PROG_DIR, "CSS", "body.css")) as style_body:
            book.add_body_style(style_body.read())

        # Introduction
        book.add_intro(author, initial_url, description,
                       os.path.join(PROG_DIR, "HTML", "intro.xhtml"))

        for item in chapterlist:
            chaptertitle = item.get_text().strip().replace("/", "-")
            if chaptertitle.upper() != "A-N":
                print("Working on: {}".format(chaptertitle).encode("utf-8"))
                ch_title, ch_file, ch_text = get_chapter("{}{}".format(
                    base_url, item['href']))
                book.add_chapter(chaptertitle, ch_file, LANGUAGE, ch_text)

        # Define Table of Contents, NCX, Nav and book spine
        book.finalize()

        # Write the epub to file
        book.write(epubfile)

    else:
        print("Epub file already exists, not updating")
Example #6
def get_html(url):
    return gsweb.get_soup(url)
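
All of the examples call gsweb.get_soup, whose source is not shown here. It presumably does something along these lines (a sketch, assuming requests plus BeautifulSoup with the lxml parser used in Example #1):

import requests
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch the page and parse it; raise on HTTP errors instead of
    # silently returning an empty soup.
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')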