Example #1
0
def parse_mph_rss(document, headers, filename=None):
    """Parses the MPH bestsellers RSS feed and stores the books it lists.

    Every ``<item>`` of the feed is turned into a dict with the keys
    ``title``, ``isbn13``, ``authors``, ``description``, ``mph_bestseller``
    and ``_links``; the resulting list is handed to ``store_books`` when it
    is not empty.

    A text field whose element is missing, empty, or whose first child is
    not a plain string is stored as ``None`` (previously a non-string first
    child left the raw parser element in the dict).

    Args:
        document: object whose ``contents`` attribute holds the feed markup.
        headers: not used by this function.
        filename: not used by this function.

    Returns:
        None.
    """
    rss   = BeautifulStoneSoup(document.contents)
    books = []

    for item in rss.findAll('item'):
        elements = {
            "title": item.find('title'),
            "isbn13": item.find('isbn'),
            "authors": item.find('author'),
            "description": item.find('description')
        }

        book = {}
        for field, element in elements.items():
            # Default to None so a parser element never leaks into the
            # book dict, whatever the shape of the markup.
            value = None
            if element and element.contents:
                first = element.contents[0]
                if isinstance(first, basestring):
                    value = first.strip()
                    if field == 'authors':
                        # authors is stored as a list, even for one name
                        value = [value]
                    elif field != 'description':
                        # descriptions keep their text as-is; the other
                        # fields get RE_NEWLINE matches replaced by a space
                        value = RE_NEWLINE.sub(' ', value)
            book[field] = value

        book['mph_bestseller'] = True
        book['_links'] = []

        links = {
            "thumbnail": item.find('image'),
            "info": item.find('link'),
        }

        for name, element in links.items():
            if element and element.contents:
                url = element.contents[0]
                # only plain-string URLs are recorded
                if isinstance(url, basestring):
                    book['_links'].append({
                        "name": name,
                        "url": url
                    })

        books.append(book)

    logging.info('parse_mph_rss: %d books found' % len(books))

    if books:
        store_books(books)
Example #2
0
def parse_bookxcess_pdf(document, headers, filename=None, pages=None, debug=False):
    """Parses a batch of pages from a Bookxcess PDF and stores the books.

    The PDF in ``document.contents`` is converted with ``pdf2text.convert``
    for the requested ``pages`` (the first ``PDF_PROCESS_PAGES`` pages when
    none are given). Rows that map to a book are passed to ``store_books``,
    and, when pages remain, a deferred task is queued for the next batch.

    Args:
        document: object providing ``contents`` and ``urlhash``.
        headers: forwarded unchanged to the deferred follow-up call.
        filename: name of the PDF; ``'fiction.pdf'`` has its author and
            title columns swapped relative to the other files.
        pages: page numbers to process in this call.
        debug: when true, return the parsed rows without storing them or
            queueing a follow-up task.

    Returns:
        The list of parsed books when ``debug`` is true, otherwise None.
    """
    def _row_to_book(row):
        # A usable row has exactly four columns.
        if len(row) != 4:
            return None
        first, second, raw_price, isbn = row
        if filename == 'fiction.pdf':
            author, title = first, second
        else:
            title, author = first, second
        author = author.title()
        title  = titlecase(title)
        try:
            amount = float(raw_price)
        except ValueError:
            # rows without a numeric price are dropped
            return None
        price_entry = {
            "source": 'bookxcess',
            "price": amount
        }
        return {
            "isbn13": isbn,
            "title": title,
            "authors": [author],
            "_prices": [price_entry]
        }

    if not pages:
        pages = range(0, PDF_PROCESS_PAGES)

    result, total = pdf2text.convert(document.contents, pages=pages, mapper=_row_to_book)
    logging.info('parse_bookxcess_pdf: %s: pages %s of %d: %d books found' % (filename, pages, total, len(result)))

    if debug:
        return result

    if result:
        store_books(result)

    # Nothing more to schedule for a PDF reported as having at most one page.
    if not total > 1:
        return

    first_next = pages[-1] + 1
    if not first_next < total:
        return

    next_pages = range(first_next, first_next + PDF_PROCESS_PAGES)
    page_suffix = '-'.join([str(d) for d in next_pages])
    task_name = 'parse-bookxcess-pdf-%s-%s' % (document.urlhash, page_suffix)
    logging.info('parse_bookxcess_pdf: next: %s' % task_name)
    try:
        deferred.defer(parse_bookxcess_pdf, document, headers,
                       filename, next_pages, _name=task_name)
    except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError):
        # a task with this name was already queued (or has already run)
        pass
Example #3
0
File: app.py Project: zaim/bukutip
    def _get(self, isbn):
        """Builds the template context for a single book page.

        The book is looked up in this order:
          1. ``models.Book.get_by_isbn`` (without creating it);
          2. the temporary ``booktmp:<isbn>`` memcache entry, which is
             deleted once read;
          3. ``booksearch.search`` -- used when nothing was found, or when
             the stored book lacks a ``google_id`` or an ``amazon_id``.
        Any results obtained from steps 2 or 3 are saved through
        ``models.store_books(..., update=True)``.

        Args:
            isbn: ISBN of the requested book.

        Returns:
            A ``('view.html', context)`` tuple. ``context['book']`` is None
            and ``context['page_title']`` keeps the "not found" text when
            no book could be resolved.
        """
        context = {
            "isbn": isbn,
            "page_title": 'Book %s not found' % isbn,
            "book": None,
            "shops": bookshop.SHOPS.keys()
        }

        # 1) directly from memcache or datastore
        book = models.Book.get_by_isbn(isbn, create=False)
        results = []

        if not book:
            # 2) from "booktmp" cached during session in SearchHandler
            key = 'booktmp:%s' % isbn
            book_dict = memcache.get(key)
            if book_dict:
                results = [book_dict]
                memcache.delete(key)

        # 3) actual google/amazon search
        if (not book and not results) or (book and (not book.google_id or not book.amazon_id)):
            results = booksearch.search('isbn:%s' % isbn)

        # store or update the book
        if results:
            stored = models.store_books(results, update=True)
            book = stored[0] if stored else None

        context['book'] = book
        # The original fell back to an undefined name ``title`` here, which
        # raised NameError for unknown books; the "not found" title set in
        # the context above is kept instead.
        if book:
            context['page_title'] = book.title

        return ('view.html', context)
Example #4
0
    def testUpdateFromBook(self):
        """Updating an existing book with a ``_prices`` entry.

        Stores the book referenced by the 'mph' price sample again with a
        new 'mph' price and checks that exactly one entity is returned,
        that it has exactly one price matching the submitted values, and
        that the total number of books and prices is unchanged.
        """
        isbn13 = self.price_sample['mph']['book'].isbn13
        # Work on a shallow copy: self.sample presumably holds the shared
        # fixture dicts (verify against setUp), and adding '_prices' to one
        # of them in place would leak into every later test.
        book = dict(self.sample[isbn13])
        book['_prices'] = [{
            "source": 'mph',
            "price": 90.5
        }]

        objects = models.store_books([book], update=True)
        returned = len(objects)
        self.assertEqual(returned, 1, 'returned %d entities, expected 1' % returned)

        obj = objects[0]
        obj_prices = obj.price_set.count()
        self.assertEqual(obj_prices, 1, 'obj.price_set returned %d entities, expected 1' % obj_prices)

        obj_price = obj.price_set.fetch(1)[0]
        for k, v in book['_prices'][0].iteritems():
            ov = getattr(obj_price, k)
            self.assertEqual(v, ov, "price['%s'] != obj.%s (expected '%s', got '%s')" % (k, k, v, ov))

        # re-test book dupe check, just in case
        all_books = models.Book.all().count()
        expected  = len(self.sample)
        self.assertEqual(all_books, expected, 'book was added, not updated (expected %d books, got %d)' % (expected, all_books))

        # the price was updated, so the number of prices must not change
        all_prices = models.Price.all().count()
        expected = len(self.price_sample)
        self.assertEqual(all_prices, expected, 'expected %d prices, got %d' % (expected, all_prices))
Example #5
0
    def testCreateFromBook(self):
        """Creating a price through a book's ``_prices`` entry.

        Stores an existing book that has no price yet together with a
        'times' price and checks that exactly one entity is returned, that
        it has exactly one price matching the submitted values, that no
        book was added, and that exactly one price was added.
        """
        # book 3 has no prices set up
        # Work on a shallow copy: self.sample presumably holds the shared
        # fixture dicts (verify against setUp), and adding '_prices' to one
        # of them in place would leak into every later test.
        book = dict(self.sample['9780596806026'])
        book['_prices'] = [
            {
                "source": 'times',
                "price": 42.42
            }
        ]

        objects  = models.store_books([book], update=True)
        returned = len(objects)
        self.assertEqual(returned, 1, 'returned %d entities, expected 1' % returned)

        obj = objects[0]
        obj_prices = obj.price_set.count()
        self.assertEqual(obj_prices, 1, 'obj.price_set returned %d entities, expected 1' % obj_prices)

        obj_price = obj.price_set.fetch(1)[0]
        for k, v in book['_prices'][0].iteritems():
            self.assertEqual(v, getattr(obj_price, k), 'price[%s] != obj.%s' % (k, k))

        # re-test book dupe check, just in case
        all_books = models.Book.all().count()
        expected  = len(self.sample)
        self.assertEqual(all_books, expected, 'book was added, not updated (expected %d books, got %d)' % (expected, all_books))

        # one new price on top of the sample prices
        all_prices = models.Price.all().count()
        expected = len(self.price_sample) + 1
        self.assertEqual(all_prices, expected, 'expected %d prices, got %d' % (expected, all_prices))
Example #6
0
    def testCreateAndUpdate(self):
        """Stores one new and one existing book in a single call.

        Expects two entities back whose attributes match the submitted
        dicts, and exactly one more book in the datastore than the sample.
        """
        books = {
            # create:
            "9780596158064": {
                "isbn13": '9780596158064',
                "title": 'Learning Python',
                "authors": ['Mark Lutz']
            },
            # update:
            "9781430224150": {
                "isbn13": '9781430224150',
                "google_id": 'ekrhtG-Hn5IC',
                "title": 'Dive Into Python 3'
            }
        }
        stored = models.store_books(books.values(), update=True)

        count = len(stored)
        self.assertEqual(count, 2, 'returned %d entities, expected 2' % count)

        # every submitted field must be readable back from its entity
        for entity in stored:
            submitted = books.get(entity.isbn13)
            for name, value in submitted.iteritems():
                self.assertEqual(value, getattr(entity, name), 'book[%s] != obj.%s' % (name, name))

        expected  = len(self.sample) + 1
        all_books = models.Book.all().count()
        self.assertEqual(all_books, expected, 'expected %d books, got %d' % (expected, all_books))
Example #7
0
File: app.py Project: zaim/bukutip
    def _get(self):
        """Runs a book search for the ``q`` request parameter.

        A query matching either ISBN regex of ``models.Book`` is rewritten
        to an ``isbn:`` keyword search. A single result is stored right away
        and the request is redirected to that book's page; several results
        are decorated for the template and cached in memcache under
        ``booktmp:<isbn13>`` keys for 1200 seconds.

        Returns:
            ``(None, None)`` after a redirect, otherwise a
            ``('search.html', context)`` tuple.
        """
        query = self.request.get('q', '').strip()

        # modify query to do an ISBN: keyword search if given query is one
        book_cls = models.Book
        if book_cls.ISBN13_REGEX.match(query) or book_cls.ISBN10_REGEX.match(query):
            query = 'isbn:%s' % query

        context = {
            "page_title": u'Search: "%s"' % query,
            "page_id": 'search',
            "query": query,
            "books": []
        }
        results = booksearch.search(query, hashed=booksearch.hash_query(query))

        if not results:
            return ('search.html', context)

        # if only has 1 result, immediately save and redirect
        if len(results) == 1:
            # no need to update=True since google book search results
            # rarely change...
            models.store_books(results, update=False)
            self.redirect('/%s' % results[0]['isbn13'])
            return (None, None)

        # convert the resulting dict objects into temporary faux Book
        # entities for use in the template, then defer the actual
        # creation of the entities to BookHandler
        pending = {}
        for entry in results:
            entry['permalink'] = '/%s' % entry['isbn13']
            by_name = entry['links'] = {}
            for link in entry.get('_links', []):
                by_name[link['name']] = link['url']
            pending['booktmp:%s' % entry['isbn13']] = entry

        # temporarily cache the dicts for later saving in BookHandler
        # we give the user 20 minutes to browse/search before
        # clicking a book
        memcache.set_multi(pending, 1200)
        context['books'] = results

        return ('search.html', context)
Example #8
0
    def testCreate(self):
        """Stores a brand-new book and checks it was saved and counted."""
        new = {
            "isbn13": '9780596158064',
            "title": 'Learning Python',
            "authors": ['Mark Lutz']
        }
        stored = models.store_books([new])

        count = len(stored)
        self.assertEqual(count, 1, 'returned %d entities, expected 1' % count)

        entity = stored[0]
        self.assertTrue(entity.is_saved(), 'object not saved')
        # each submitted field must come back unchanged on the entity
        for name in ('isbn13', 'title', 'authors'):
            self.assertEqual(getattr(entity, name), new[name])

        expected  = len(self.sample) + 1
        all_books = models.Book.all().count()
        self.assertEqual(all_books, expected, 'book not added to datastore')
Example #9
0
    def testUpdate(self):
        """Updates an existing book and checks no duplicate was created."""
        book = {
            "isbn13": '9781430224150',
            "google_id": 'ekrhtG-Hn5IC',
            "title": 'Dive Into Python 3'
        }
        stored = models.store_books([book], update=True)

        count = len(stored)
        self.assertEqual(count, 1, 'returned %d entities, expected 1' % count)

        entity = stored[0]
        self.assertTrue(entity.is_saved(), 'object not saved')
        # each submitted field must come back unchanged on the entity
        for name in ('isbn13', 'google_id', 'title'):
            self.assertEqual(getattr(entity, name), book[name])

        expected  = len(self.sample)
        all_books = models.Book.all().count()
        self.assertEqual(all_books, expected, 'book was added, not updated (expected %d books, got %d)' % (expected, all_books))
Example #10
0
 def setUp(self):
     """Indexes the sample books by ISBN-13 and stores them."""
     by_isbn = {}
     for entry in sample_books:
         by_isbn[entry['isbn13']] = entry
     self.sample  = by_isbn
     self.objects = models.store_books(sample_books)