def _spider_book_info(url, letter):
    # Download and parse a single manybooks.net title page. Returns a
    # (url, title, subtitle, author, book_id, code, formats) tuple, or None
    # if the page doesn't look like a title page. Assumes module-level
    # imports of BeautifulSoup and the helpers used below (getHttp,
    # retrieveContents, log, exceptionAsStr, ebooks).
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            # Strip the 19-character "/language.php?code=" prefix, up to "&".
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # Unexpected page structure or network failure: log and skip the book.
        log(SEV_EXC, exceptionAsStr(ex))
        return None
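# Hypothetical usage sketch, not part of the original module: the title path
# below is illustrative only. It shows the shape of the tuple that
# _spider_book_info returns on success.
def _example_spider_one_book():
    info = _spider_book_info(_g_manybooks_url + "/titles/example.html", "e")
    if info is not None:
        url, title, subtitle, author, book_id, code, formats = info
        print "spidered %s (%d format(s))" % (book_id, len(formats))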
def _parse_letter_page(self, letter, html, index):
    # Parse one letter-index page: compare the server-side record count with
    # what we already have, then spider any titles not seen yet. Returns a
    # (done, count_or_index, spidered) tuple.
    self._check_finish()
    soup = BeautifulSoup()
    soup.feed(html)
    div = soup.first("div", {"class": "sidebar-module"})
    assert div is not None
    count = int(retrieveContents(div.contents[2]).split()[2])
    offset = 0
    self._lock.acquire()
    try:
        if count <= self._data[letter][0]:
            print 'Letter "%s" is up to date (%d records).' % (letter, self._data[letter][0])
            return True, count, 0
        offset = self._offsets[letter]
    finally:
        self._lock.release()
    spidered = 0
    div = soup.first("div", {"class": "titleList"})
    assert div is not None
    anchors = div.fetch("a")
    urls = []
    for a in anchors:
        url = _g_manybooks_url + urllib.quote(a["href"])
        urls.append(url)
    for url in urls:
        self._check_finish()
        i = -1
        self._lock.acquire()
        try:
            books = self._data[letter][1]
            i = _find_book_index(books, url, index)
        finally:
            self._lock.release()
        if -1 != i:
            # Already spidered; resume scanning right after it.
            index = i + 1
        else:
            book = _spider_book_info(url, letter)
            if book is not None:
                spidered += 1
                self._lock.acquire()
                try:
                    self._fresh_books.append((letter, index + offset, book))
                    if len(self._fresh_books) == self.flush_after:
                        self._flush_books()
                    offset += 1
                    self._offsets[letter] = offset
                    if self._data[letter][0] + offset == count:
                        return True, count, spidered
                finally:
                    self._lock.release()
    return (index + offset == count), index, spidered
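# Hypothetical sketch of the _find_book_index helper called above; the real
# implementation lives elsewhere in this module. Assumed behavior: scan the
# per-letter book list from start_index onward for a record whose first
# element (the url, as in the tuple built by _spider_book_info) matches,
# returning its position, or -1 when the url has not been spidered yet.
def _find_book_index(books, url, start_index):
    for i in range(start_index, len(books)):
        if url == books[i][0]:
            return i
    return -1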