def _spider_book_info(url, letter):
    # Download and parse a single manybooks.net title page. Returns a
    # (url, title, subtitle, author, book_id, code, formats) tuple, or None
    # if the page doesn't look like a title page. Assumes module-level
    # imports of BeautifulSoup and the helpers used below (getHttp,
    # retrieveContents, log, exceptionAsStr, ebooks).
    try:
        html = getHttp(url, handleException = False)
        soup = BeautifulSoup()
        soup.feed(html)
        h1 = soup.first("h1")
        if h1 is None:
            return None
        title = retrieveContents(h1).decode("iso-8859-1")
        subtitle = None
        author = None
        code = None
        labels = [retrieveContents(x) for x in soup.fetch("span", {"class": "title-label"})]
        data = soup.fetch("span", {"class": "title-data"})
        try:
            index = labels.index("Subtitle")
            subtitle = retrieveContents(data[index]).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Author")
            author = retrieveContents(data[index].first("a")).decode("iso-8859-1")
        except ValueError:
            pass
        try:
            index = labels.index("Language")
            href = str(data[index].first("a", {"href": "/language.php?code=%"})["href"])
            # Strip the 19-character "/language.php?code=" prefix, up to "&".
            code = href[19:href.find("&", 19)].decode("iso-8859-1")
        except ValueError:
            pass
        tid = soup.first("input", {"type": "hidden", "name": "tid"})
        assert tid is not None
        book_id = tid["value"].decode("iso-8859-1")
        print (u"%s: \"%s\"" % (author, title)).encode("iso-8859-1", "ignore")
        sel = soup.first("select", {"name": "book"})
        assert sel is not None
        opts = sel.fetch("option")
        formats = []
        for opt in opts:
            try:
                format = retrieveContents(opt).split()[0]
                if format not in ebooks.FORMATS:
                    continue
                val = opt["value"]
                formats.append((format, val))
            except Exception, ex:
                log(SEV_EXC, exceptionAsStr(ex))
        formats.sort()
        return (url, title, subtitle, author, book_id, code, formats)
    except Exception, ex:
        # Unexpected page structure or network failure: log and skip the book.
        log(SEV_EXC, exceptionAsStr(ex))
        return None
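# Hypothetical usage sketch, not part of the original module: the title path
# below is illustrative only. It shows the shape of the tuple that
# _spider_book_info returns on success.
def _example_spider_one_book():
    info = _spider_book_info(_g_manybooks_url + "/titles/example.html", "e")
    if info is not None:
        url, title, subtitle, author, book_id, code, formats = info
        print "spidered %s (%d format(s))" % (book_id, len(formats))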
def _parse_letter_page(self, letter, html, index):
    # Parse one letter-index page: compare the server-side record count with
    # what we already have, then spider any titles not seen yet. Returns a
    # (done, count_or_index, spidered) tuple.
    self._check_finish()
    soup = BeautifulSoup()
    soup.feed(html)
    div = soup.first("div", {"class": "sidebar-module"})
    assert div is not None
    count = int(retrieveContents(div.contents[2]).split()[2])
    offset = 0
    self._lock.acquire()
    try:
        if count <= self._data[letter][0]:
            print 'Letter "%s" is up to date (%d records).' % (letter, self._data[letter][0])
            return True, count, 0
        offset = self._offsets[letter]
    finally:
        self._lock.release()
    spidered = 0
    div = soup.first("div", {"class": "titleList"})
    assert div is not None
    anchors = div.fetch("a")
    urls = []
    for a in anchors:
        url = _g_manybooks_url + urllib.quote(a["href"])
        urls.append(url)
    for url in urls:
        self._check_finish()
        i = -1
        self._lock.acquire()
        try:
            books = self._data[letter][1]
            i = _find_book_index(books, url, index)
        finally:
            self._lock.release()
        if -1 != i:
            # Already spidered; resume scanning right after it.
            index = i + 1
        else:
            book = _spider_book_info(url, letter)
            if book is not None:
                spidered += 1
                self._lock.acquire()
                try:
                    self._fresh_books.append((letter, index + offset, book))
                    if len(self._fresh_books) == self.flush_after:
                        self._flush_books()
                    offset += 1
                    self._offsets[letter] = offset
                    if self._data[letter][0] + offset == count:
                        return True, count, spidered
                finally:
                    self._lock.release()
    return (index + offset == count), index, spidered
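# Hypothetical sketch of the _find_book_index helper called above; the real
# implementation lives elsewhere in this module. Assumed behavior: scan the
# per-letter book list from start_index onward for a record whose first
# element (the url, as in the tuple built by _spider_book_info) matches,
# returning its position, or -1 when the url has not been spidered yet.
def _find_book_index(books, url, start_index):
    for i in range(start_index, len(books)):
        if url == books[i][0]:
            return i
    return -1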