Exemple #1
0
def _getGutenbergSpiegelChapters(pages):
    """Build one chapter per HTML page from its ``<div id="gutenb">`` content.

    The chapter title is the concatenated text of the first ``<h3>`` found
    inside that div. Every ``<p>`` inside the div becomes a paragraph whose
    text is split into sentences on ``.``, ``!`` and ``?``; blank sentences
    and paragraphs without any sentence are dropped.

    Args:
        pages: Iterable of HTML documents, each passed to ``BeautifulSoup``.

    Returns:
        A list of ``Library.BookChapter`` objects in page order. A page is
        skipped when it has no ``gutenb`` div, when no paragraph with at
        least one sentence was found, or when the chapter's ``Title`` is
        ``None``.
    """
    chapters = []

    for page in pages:
        soup = BeautifulSoup(page)

        # Look the container up once; a page without it has nothing to parse
        # and must not abort the processing of the remaining pages.
        contentElement = soup.find('div', attrs={'id': 'gutenb'})
        if contentElement is None:
            continue

        chapter = Library.BookChapter()
        # Give every chapter its own list, as the sibling parsers do, so that
        # chapters can never share paragraph storage.
        chapter.Paragraphs = []

        titleElement = contentElement.find('h3')
        if titleElement is not None:
            chapter.Title = ''.join(titleElement.strings)

        for paragraphElement in contentElement.find_all('p'):
            paragraphString = ''.join(paragraphElement.strings)
            # Normalise all sentence terminators to '.' before splitting.
            rawSentences = paragraphString.replace('!', '.').replace(
                '?', '.').split('.')
            sentences = [s.strip() for s in rawSentences if s.strip() != '']
            if not sentences:
                continue
            paragraph = Library.BookParagraph()
            paragraph.Sentences = sentences
            chapter.Paragraphs.append(paragraph)

        if chapter.Paragraphs and chapter.Title is not None:
            chapters.append(chapter)

    return chapters
Exemple #2
0
def _getLowereadBookChapters(pages):
    """Split a sequence of HTML pages into chapters with paragraphs.

    On each page the content is taken from the ``<p class="MsoNormal">``
    inside the ``<div class="MsoNormal">`` inside ``<td class="tb_read_book">``.
    Within that content, a ``<div>`` whose ``.string`` is non-blank starts a
    new chapter (its string becomes the title) and every non-blank ``<p>``
    becomes a paragraph of the current chapter, split into sentences on
    ``.``, ``!`` and ``?``. The current chapter carries over from one page to
    the next, so a chapter may span several pages.

    Args:
        pages: Iterable of HTML documents, each passed to ``BeautifulSoup``.

    Returns:
        A list of ``Library.BookChapter`` objects in document order. Chapters
        without any paragraph are dropped. Paragraphs that appear before the
        first heading are collected in a chapter titled ``''``.
    """
    chapters = []

    currentChapter = Library.BookChapter()
    currentChapter.Title = ''
    currentChapter.Paragraphs = []

    for page in pages:
        soup = BeautifulSoup(page)

        # Each lookup may fail on pages with a different layout; skip such
        # pages instead of raising AttributeError on a None result.
        readBookElement = soup.find('td', attrs={'class': 'tb_read_book'})
        if readBookElement is None:
            continue
        wrapperElement = readBookElement.find(
            'div', attrs={'class': 'MsoNormal'})
        if wrapperElement is None:
            continue
        pageContentElement = wrapperElement.find(
            'p', attrs={'class': 'MsoNormal'})
        if pageContentElement is None:
            continue

        # Only exact 'p' and 'div' tags are relevant; a name list avoids the
        # accidental substring matches of an unanchored regular expression.
        for child in pageContentElement.find_all(['p', 'div']):
            if child.name == 'div':
                heading = child.string
                if heading is not None and heading.strip() != '':
                    # A heading closes the chapter collected so far.
                    if currentChapter.Paragraphs:
                        chapters.append(currentChapter)
                    currentChapter = Library.BookChapter()
                    currentChapter.Title = heading
                    # Start from a fresh list, like the initial chapter.
                    currentChapter.Paragraphs = []
            elif child.name == 'p':
                paragraphString = ''.join(child.strings)
                if paragraphString.strip() == '':
                    continue
                # NOTE(review): os.linesep is platform-dependent, so line
                # breaks in the HTML text are only replaced where they match
                # the host platform's separator - confirm this is intended.
                paragraphString = paragraphString.replace(os.linesep, ' ')
                rawSentences = paragraphString.replace('!', '.').replace(
                    '?', '.').split('.')
                sentences = [s.strip() for s in rawSentences]
                sentences = [s for s in sentences if s != ""]
                if sentences:
                    paragraph = Library.BookParagraph()
                    paragraph.Sentences = sentences
                    currentChapter.Paragraphs.append(paragraph)

    # The chapter still being collected when the input ends was previously
    # dropped; flush it so the last chapter of the book is returned too.
    if currentChapter.Paragraphs:
        chapters.append(currentChapter)

    return chapters
Exemple #3
0
def _getReadcentralBookChapter(path):
    """Download one chapter page from www.readcentral.com and parse it.

    The title is the stripped ``.string`` of the first ``<div>`` inside
    ``<div id="pagehead">``. Each ``<p>`` inside the element with id
    ``ctl00_contents_book_chapter_content_area`` is converted to sentences
    with ``getEnSentencesFromParagraphString`` and kept when that yields at
    least one sentence.

    Args:
        path: Path handed to ``HtmlDownloader.DownloadHtml`` together with
            the host ``www.readcentral.com``.

    Returns:
        The populated ``Library.BookChapter``.

    Note:
        Missing page elements are not handled here; attribute access on a
        failed lookup propagates as an exception to the caller.
    """
    result = Library.BookChapter()
    result.Paragraphs = []

    markup = HtmlDownloader.DownloadHtml('www.readcentral.com', path)
    document = BeautifulSoup(markup)

    headerBlock = document.find('div', attrs={'id': 'pagehead'})
    result.Title = headerBlock.div.string.strip()

    contentBlock = document.find(
        'div', attrs={'id': 'ctl00_contents_book_chapter_content_area'})
    for node in contentBlock.find_all('p'):
        # ''.join always yields a str, so no None check is needed here.
        text = ''.join(node.strings)
        entry = Library.BookParagraph()
        entry.Sentences = getEnSentencesFromParagraphString(text)
        if len(entry.Sentences) == 0:
            continue
        result.Paragraphs.append(entry)

    return result