import re

from bs4 import BeautifulSoup

# Project-local modules; these import paths are assumptions about the
# surrounding package layout and may need adjusting.
import HtmlDownloader
import Library
from Library import Book, ParallelBook


def _getWikiBooks(enWikiUrls):
    wikiBooks = []
    for enWikiUrl in enWikiUrls:
        print('Getting wiki book pages for', enWikiUrl)
        wikiBook = _WikiBook()
        wikiBook.Pages = []

        # The English page is always present; it is the starting point.
        wikiBookPage = _WikiBookPage()
        wikiBookPage.Language = 'EN'
        wikiBookPage.WikiUrl = enWikiUrl
        wikiBook.Pages.append(wikiBookPage)

        html = HtmlDownloader.DownloadHtml('en.wikipedia.org', enWikiUrl)
        soup = BeautifulSoup(html, 'html.parser')

        # Follow the sidebar interlanguage links to the Russian and German
        # versions of the article, when they exist.
        for language in ('ru', 'de'):
            linkElement = soup.find(
                'li',
                attrs={'class': 'interlanguage-link interwiki-' + language})
            if linkElement is not None:
                wikiBookPage = _WikiBookPage()
                wikiBookPage.Language = language.upper()
                wikiBookPage.WikiUrl = linkElement.find('a')['href'].replace(
                    '//' + language + '.wikipedia.org', '')
                wikiBook.Pages.append(wikiBookPage)

        wikiBooks.append(wikiBook)
    return wikiBooks

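# _WikiBook and _WikiBookPage are referenced above but not defined in this
# section. A minimal sketch of what they plausibly look like, assuming they
# are plain attribute containers; the real definitions may differ.
class _WikiBook:
    def __init__(self):
        self.Pages = []  # list of _WikiBookPage


class _WikiBookPage:
    def __init__(self):
        self.Language = None  # 'EN', 'RU' or 'DE'
        self.WikiUrl = None   # path part of the article URL, e.g. '/wiki/...'
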
def _getLowereadBookPages(bookId):
    pages = []
    pageNum = 1
    while True:
        html = HtmlDownloader.DownloadHtml(
            'loveread.ws',
            '/read_book.php?id=' + bookId + '&p=' + str(pageNum))
        # If the page number reported inside the HTML does not match the one
        # requested, we have walked past the last page of the book.
        if str(pageNum) != _getLowereadPageNumber(html):
            break
        pages.append(html)
        pageNum += 1
    return pages

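# _getLowereadPageNumber is not defined in this section. A minimal sketch of
# one plausible implementation, assuming the reader marks the current page
# with a <td class="cur_page"> cell; the selector is an assumption about
# loveread.ws markup and may need adjusting.
def _getLowereadPageNumber(html):
    soup = BeautifulSoup(html, 'html.parser')
    currentPageElement = soup.find('td', attrs={'class': 'cur_page'})
    if currentPageElement is None:
        return None
    return currentPageElement.get_text().strip()
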
def _getGutenbergSpiegelPages(bookId):
    pages = []
    pageNum = 1
    while True:
        pageHtml = HtmlDownloader.DownloadHtml(
            'gutenberg.spiegel.de', '/buch/' + bookId + '/' + str(pageNum))
        # Page 1 always exists; after that, an empty page marks the end of
        # the book.
        if pageNum > 1 and _isGutenbergSpiegelPageEmpty(pageHtml):
            break
        pages.append(pageHtml)
        pageNum += 1
    return pages

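# _isGutenbergSpiegelPageEmpty is not defined in this section. A minimal
# sketch of one plausible implementation, assuming a page past the end of the
# book contains no text paragraphs in its content area; the 'gutenb' id is an
# assumption about the site's markup and may need adjusting.
def _isGutenbergSpiegelPageEmpty(pageHtml):
    soup = BeautifulSoup(pageHtml, 'html.parser')
    contentElement = soup.find('div', attrs={'id': 'gutenb'})
    return contentElement is None or contentElement.find('p') is None
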
def _getReadCentraChapterPaths(path):
    chapterPaths = []
    html = HtmlDownloader.DownloadHtml('www.readcentral.com', path)
    soup = BeautifulSoup(html, 'html.parser')
    tdElements = soup.find_all('td', attrs={'class': 'bookindex'})
    for tdElement in tdElements:
        linkElement = tdElement.find('a')
        if linkElement is not None:
            chapterPaths.append(linkElement['href'])
    # Chapter paths are assumed to sort lexicographically into reading order.
    chapterPaths.sort()
    return chapterPaths

def _getPathToReadcentralBook(title):
    path = None
    titleFirstLetter = title[0]
    # readcentral.com indexes books by the first letter of the title.
    html = HtmlDownloader.DownloadHtml(
        'www.readcentral.com', '/read-online-books/' + titleFirstLetter)
    soup = BeautifulSoup(html, 'html.parser')
    tdElements = soup.find_all('td', attrs={'class': 'bookindex'})
    for tdElement in tdElements:
        linkElement = tdElement.find('a')
        if linkElement is None or linkElement.string is None:
            continue
        if linkElement.string.strip().lower() == title.lower():
            path = linkElement['href']
            break
    return path

def _getGutenbergSpiegelBookId(title):
    bookId = None
    title = title.lower()
    indexPageHtml = HtmlDownloader.DownloadHtml('gutenberg.spiegel.de', '/buch')
    indexPageSoup = BeautifulSoup(indexPageHtml, 'html.parser')
    booksElements = indexPageSoup.find(
        'div', attrs={'id': 'spTeaserColumn'}).find_all('a')
    for bookElement in booksElements:
        if bookElement.string is None:
            continue
        if bookElement.string.strip().lower() == title:
            # Links look like '/buch/<bookId>/<page>', so the id is the
            # second path segment.
            bookId = bookElement['href'].split('/')[2]
            break
    return bookId

def _getLowereadBookId(title):
    bookId = None
    title = title.lower()
    titleFirstLetter = title[0]
    # loveread.ws indexes books by letter; 'let' is the 1-based position of
    # the title's first letter in the Cyrillic alphabet (note that 'а' below
    # is the Cyrillic letter, not the Latin one).
    html = HtmlDownloader.DownloadHtml(
        'loveread.ws',
        '/letter_nav.php?let=' + str((ord(titleFirstLetter) - ord('а')) + 1))
    soup = BeautifulSoup(html, 'html.parser')
    booksElement = soup.find('ul', attrs={'class': 'let_ul'})
    if booksElement is not None:
        for bookElement in booksElement.find_all('li'):
            if bookElement.a.string.strip().lower() == title:
                # Links look like '.../read_book.php?id=<bookId>'.
                bookId = bookElement.a['href'].split('id=')[1]
                break
    return bookId

def _getReadcentralBookChapter(path):
    chapter = Library.BookChapter()
    chapter.Paragraphs = []
    html = HtmlDownloader.DownloadHtml('www.readcentral.com', path)
    soup = BeautifulSoup(html, 'html.parser')
    pageheadElement = soup.find('div', attrs={'id': 'pagehead'})
    chapter.Title = pageheadElement.div.string.strip()
    contentElement = soup.find(
        'div', attrs={'id': 'ctl00_contents_book_chapter_content_area'})
    for paragraphElement in contentElement.find_all('p'):
        # Concatenate all text fragments of the paragraph, dropping markup.
        paragraphString = ''.join(paragraphElement.strings)
        if paragraphString:
            paragraph = Library.BookParagraph()
            paragraph.Sentences = getEnSentencesFromParagraphString(
                paragraphString)
            if len(paragraph.Sentences) != 0:
                chapter.Paragraphs.append(paragraph)
    return chapter

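# getEnSentencesFromParagraphString is not defined in this section. A minimal
# sketch, assuming a naive split on sentence-final punctuation is acceptable;
# the real implementation likely handles abbreviations and quotes better.
def getEnSentencesFromParagraphString(paragraphString):
    sentences = re.split(r'(?<=[.!?])\s+', paragraphString.strip())
    return [sentence for sentence in sentences if sentence]
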
def _getBooks(wikiBooks):
    parallelBooks = []
    print(len(wikiBooks))
    for i, wikiBook in enumerate(wikiBooks):
        print(str(i))
        parallelBook = ParallelBook()
        parallelBook.Books = []
        for wikiBookPage in wikiBook.Pages:
            print(wikiBookPage.WikiUrl)
            book = Book()
            book.Language = wikiBookPage.Language
            html = HtmlDownloader.DownloadHtml(
                wikiBookPage.Language.lower() + '.wikipedia.org',
                wikiBookPage.WikiUrl)
            soup = BeautifulSoup(html, 'html.parser')
            headingElement = soup.find('h1', attrs={'id': 'firstHeading'})
            book.Title = headingElement.text
            # Strip disambiguation suffixes such as ' (novel)'.
            book.Title = re.sub(r'\s*\(.+\)', '', book.Title).strip()
            parallelBook.Books.append(book)
        parallelBooks.append(parallelBook)
    return parallelBooks

def _getEnWikiUrls():
    wikiUrls = []
    html = HtmlDownloader.DownloadHtml('en.wikipedia.org',
                                       '/wiki/100_Classic_Book_Collection')
    soup = BeautifulSoup(html, 'html.parser')
    tableElements = soup.find_all('table',
                                  attrs={'class': 'wikitable sortable'})
    for tableElement in tableElements:
        rowElements = tableElement.find_all('tr')
        for rowElement in rowElements:
            dataElement = rowElement.find('td')
            if dataElement is None:
                continue
            # Book titles are italicised links in the first table column.
            italicElement = dataElement.find('i')
            if italicElement is None:
                continue
            bookElement = italicElement.find('a')
            if bookElement is not None:
                wikiUrls.append(bookElement['href'])
    # Deduplicate; the resulting order is arbitrary.
    return list(set(wikiUrls))

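# A minimal usage sketch tying the pieces together, assuming the functions
# above form the whole pipeline; error handling and persistence are omitted.
if __name__ == '__main__':
    enWikiUrls = _getEnWikiUrls()
    wikiBooks = _getWikiBooks(enWikiUrls)
    parallelBooks = _getBooks(wikiBooks)
    print('Collected', len(parallelBooks), 'parallel books')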