Ejemplo n.º 1
0
def fetch_contents(book, soup):
    """Append one chapter to *book* for every link in the chapter list.

    Looks up the ``div`` identified by ``"list_box"`` via ``find_tag`` and
    walks every ``<a>`` inside it.  Each link's ``href`` is concatenated onto
    the value returned by ``host_of(soup)`` to form the chapter URL, and the
    link's stripped string becomes the chapter title.

    ``fetch_text`` is handed to ``yem.Text.for_html`` as a callback rather
    than being called here (presumably so the text is loaded lazily --
    confirm against the ``yem`` API).
    """
    base = host_of(soup)
    listing = find_tag(soup, "div", "list_box")
    for link in listing.find_all("a"):
        chapter_url = base + link["href"]
        body = yem.Text.for_html(chapter_url, fetch_text)
        chapter = yem.Chapter(text=body, title=link.string.strip())
        book.append(chapter)


def fetch_text(url):
    """Download a chapter page and return its text, one entry per line.

    The page is fetched with ``fetch_html`` using ``ENCODING`` and the
    ``div`` identified by ``"box_box"`` is located with ``find_tag``.  Only
    the direct children of that div whose ``name`` is ``None`` (text nodes,
    assuming BeautifulSoup-style objects) are kept; each is stripped and
    empty results are skipped.  The last collected line is always dropped
    (presumably a site trailer -- confirm against a real page).

    This is deliberately best-effort: any ordinary failure (network error,
    missing div, unexpected markup) yields an empty string instead of
    raising.

    :param url: address of the chapter page.
    :return: the collected lines joined with ``"\\n"``, or ``""`` on failure
        or when there is nothing to return.
    """
    try:
        soup = fetch_html(url, encoding=ENCODING)
        div = find_tag(soup, "div", "box_box")
        lines = []
        for node in div:
            # Skip element children; only nodes without a tag name carry
            # the raw text we want.
            if node.name is None:
                s = node.string.strip()
                if s:
                    lines.append(s)
        # Drop the final line.  Slicing (rather than pop()) makes the
        # "no lines collected" case return "" without relying on an
        # IndexError being swallowed below.
        return "\n".join(lines[:-1])
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running scrape.
        return ""


# Script entry point: scrape one book from a hard-coded index page and hand
# it to yem.make_book.
if __name__ == "__main__":
    # Index page of the book to scrape (hard-coded).
    url = "http://234zw.com/xingjiqiyuan/"
    book = yem.Book()
    # fetch_attributes receives the book and the URL; its return value is the
    # parsed page that fetch_contents consumes next (presumably it also fills
    # in book metadata -- confirm against its definition).
    soup = fetch_attributes(book, url)
    # Append one chapter per link found on the index page.
    fetch_contents(book, soup)
    # NOTE(review): the second argument is a hard-coded Windows path,
    # presumably the output location -- confirm against yem.make_book.
    yem.make_book(book, r"E:\tmp")
Ejemplo n.º 2
0
def fetch_contents(book, soup):
    """Append one chapter to *book* for every ``<dd>`` element in *soup*.

    For each ``<dd>`` the element reached through ``dd.next`` is treated as
    the chapter link (assumes it is the ``<a>`` tag directly inside the
    ``<dd>`` -- confirm against the page markup).  The title is the link's
    stripped string with any whitespace-prefixed ``NN-NN`` fragment removed,
    and the chapter URL is ``host_of(soup)`` concatenated with the link's
    ``href``.

    ``fetch_text`` is passed to ``yem.Text.for_html`` as a callback together
    with the chapter itself as ``tag``.
    """
    base = host_of(soup)
    for entry in soup.find_all('dd'):
        link = entry.next
        raw_title = link.string.strip()
        clean_title = re.sub(r'\s[\d]{2}-[\d]{2}', '', raw_title)
        chapter = yem.Chapter(title=clean_title)
        chapter_url = base + link['href']
        chapter.text = yem.Text.for_html(chapter_url, fetch_text, tag=chapter)
        book.append(chapter)


def fetch_text(url, chapter):
    """Download a chapter page and return the text of its content div.

    Prints a progress line with ``chapter.title``, fetches *url* with
    ``fetch_html`` using ``ENCODING`` and joins the stripped strings of the
    ``<div id="content">`` element with ``yem.LINE_SEPARATOR``.

    This is deliberately best-effort: when the page cannot be opened the
    failure is reported through ``app_error`` and ``''`` is returned; any
    other ordinary failure (missing content div, unexpected markup, network
    error) also yields ``''`` instead of raising.

    :param url: address of the chapter page.
    :param chapter: object exposing a ``title`` attribute, used only for the
        progress message.
    :return: the chapter text, or ``''`` on failure.
    """
    try:
        print('fetching text:', chapter.title)
        soup = fetch_html(url, encoding=ENCODING)
        if soup is None:
            app_error('cannot open url: {0}', url)
            return ''
        content = soup.find('div', id='content')
        if content is None:
            # No content container on the page: nothing to extract.
            return ''
        return yem.LINE_SEPARATOR.join(content.stripped_strings)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running scrape.
        return ''


# Script entry point: scrape one book from a hard-coded index page and hand
# it to yem.make_book with extra keyword options.
if __name__ == "__main__":
    # Index page of the book to scrape (hard-coded).
    url = "http://www.mangg.com/id28111/"
    book = yem.Book()
    # fetch_attributes receives the book and the URL; its return value is the
    # parsed page that fetch_contents consumes next (presumably it also fills
    # in book metadata -- confirm against its definition).
    soup = fetch_attributes(book, url)
    # Append one chapter per <dd> entry found on the index page.
    fetch_contents(book, soup)
    # Extra options forwarded to make_book as keyword arguments.  The key is
    # not a valid identifier, hence the dict + ** unpacking.
    args = {
        "pmab.text.encoding": "gb18030"
    }
    # NOTE(review): the second argument is a hard-coded Windows path and the
    # third ("pmab") presumably selects the output format -- confirm against
    # yem.make_book.
    yem.make_book(book, r"E:\tmp", "pmab", **args)