def parse_mph_rss(document, headers, filename=None):
    """Parse the MPH bestsellers RSS feed and store the books it lists.

    Every ``<item>`` in the feed becomes one book dict whose ``title``,
    ``isbn13``, ``authors`` and ``description`` come from the item's
    ``title``, ``isbn``, ``author`` and ``description`` children.  Each book
    is flagged with ``mph_bestseller = True`` and gets a ``_links`` list built
    from the item's ``image`` ("thumbnail") and ``link`` ("info") children.
    The collected books are passed to ``store_books`` when there is at least
    one.

    A field whose element is missing, empty, or whose first child is not a
    string is always stored as ``None``, so a raw soup element never ends up
    in the dict handed to ``store_books``.

    ``headers`` and ``filename`` are accepted for signature compatibility
    with the other parsers and are not used here.
    """
    def _first_string(element):
        """Return the first child of `element` if it is a string, else None."""
        if element and element.contents:
            first = element.contents[0]
            if isinstance(first, basestring):
                return first
        return None

    # (book field, feed tag) pairs for the textual attributes
    field_tags = (
        ('title', 'title'),
        ('isbn13', 'isbn'),
        ('authors', 'author'),
        ('description', 'description'),
    )
    # (link name, feed tag) pairs for the URLs collected in book['_links']
    link_tags = (
        ('thumbnail', 'image'),
        ('info', 'link'),
    )

    rss = BeautifulStoneSoup(document.contents)
    books = []
    for item in rss.findAll('item'):
        book = {}
        for field, tag_name in field_tags:
            value = _first_string(item.find(tag_name))
            if value is not None:
                value = value.strip()
                if field == 'authors':
                    # the feed carries a single author string; books store a list
                    value = [value]
                elif field != 'description':
                    # descriptions keep their line breaks; other fields are
                    # flattened onto one line
                    value = RE_NEWLINE.sub(' ', value)
            book[field] = value

        book['mph_bestseller'] = True
        book['_links'] = []
        for name, tag_name in link_tags:
            url = _first_string(item.find(tag_name))
            if url is not None:
                book['_links'].append({"name": name, "url": url})

        books.append(book)

    logging.info('parse_mph_rss: %d books found', len(books))
    if books:
        store_books(books)
def parse_bookxcess_pdf(document, headers, filename=None, pages=None, debug=False):
    """Parse one batch of pages from a Bookxcess PDF and store the books found.

    ``pdf2text.convert`` is run over ``document.contents`` for the requested
    ``pages`` (defaulting to the first ``PDF_PROCESS_PAGES`` pages) with a
    mapper that turns each 4-column row into a book dict.  Unless ``debug`` is
    set, the books are passed to ``store_books`` and, when further pages
    remain, a deferred task is queued to process the next batch.

    Args:
        document: object exposing ``contents`` (passed to ``pdf2text``) and
            ``urlhash`` (used in the follow-up task name).
        headers: passed through unchanged to the follow-up task.
        filename: name of the PDF; ``'fiction.pdf'`` has author and title
            columns swapped relative to the other files.
        pages: sequence of page indexes to process; falsy means the first
            ``PDF_PROCESS_PAGES`` pages.
        debug: when true, return the parsed books instead of storing them and
            do not queue any follow-up task.

    Returns:
        The list of parsed book dicts when ``debug`` is true, otherwise None.
    """
    def _map(item):
        """Convert one row into a book dict, or None if the row is unusable."""
        if len(item) != 4:
            return None
        # fiction.pdf lists the author first; every other file lists the title first
        if filename == 'fiction.pdf':
            author, title, price, isbn = item
        else:
            title, author, price, isbn = item
        author = author.title()
        title = titlecase(title)
        try:
            price = float(price)
        except ValueError:
            # rows without a numeric price are skipped
            return None
        return {
            "isbn13": isbn,
            "title": title,
            "authors": [author],
            "_prices": [{"source": 'bookxcess', "price": price}]
        }

    if not pages:
        pages = range(0, PDF_PROCESS_PAGES)

    result, total = pdf2text.convert(document.contents, pages=pages, mapper=_map)
    logging.info('parse_bookxcess_pdf: %s: pages %s of %d: %d books found',
                 filename, pages, total, len(result))

    if debug:
        return result

    if result:
        store_books(result)

    if total > 1:
        # first page of the next batch (named so the builtin `next` is not shadowed)
        next_start = pages[-1] + 1
        next_pages = range(next_start, next_start + PDF_PROCESS_PAGES)
        if next_start < total:
            task_name = 'parse-bookxcess-pdf-%s-%s' % (
                document.urlhash, '-'.join([str(d) for d in next_pages]))
            logging.info('parse_bookxcess_pdf: next: %s', task_name)
            try:
                deferred.defer(parse_bookxcess_pdf, document, headers, filename,
                               next_pages, _name=task_name)
            except (taskqueue.TaskAlreadyExistsError, taskqueue.TombstonedTaskError):
                # The named task doubles as a dedupe key: this batch is already
                # queued or was already run, so skipping it is intentional.
                logging.debug('parse_bookxcess_pdf: task %s already exists, skipping',
                              task_name)
def _get(self, isbn):
    """Build the template context for a single book page.

    The book is looked up in three steps:

    1. ``models.Book.get_by_isbn`` (no creation);
    2. if missing, the ``booktmp:<isbn>`` dict in memcache, which is deleted
       once read;
    3. ``booksearch.search('isbn:<isbn>')`` when nothing was found so far, or
       when the stored book lacks a ``google_id`` or an ``amazon_id``.

    Any results from steps 2 or 3 are saved with
    ``models.store_books(..., update=True)``.

    Returns:
        A ``('view.html', context)`` tuple.  ``context['book']`` is the book
        or None, and ``context['page_title']`` is the book title or the
        "Book <isbn> not found" fallback.
    """
    not_found_title = 'Book %s not found' % isbn
    context = {
        "isbn": isbn,
        "page_title": not_found_title,
        "book": None,
        "shops": bookshop.SHOPS.keys()
    }

    # 1) directly from memcache or datastore
    book = models.Book.get_by_isbn(isbn, create=False)
    results = []

    if not book:
        # 2) from "booktmp" cached during session in SearchHandler
        key = 'booktmp:%s' % isbn
        book_dict = memcache.get(key)
        if book_dict:
            results = [book_dict]
            memcache.delete(key)

    # 3) actual google/amazon search
    needs_search = (not book and not results) or \
        (book and (not book.google_id or not book.amazon_id))
    if needs_search:
        results = booksearch.search('isbn:%s' % isbn)

    # store or update the book
    if results:
        stored = models.store_books(results, update=True)
        if stored:
            book = stored[0]
        # otherwise keep whatever step 1 found: an empty store result must
        # not discard a book that already exists

    context['book'] = book
    # Fall back to the "not found" title; the previous code referenced an
    # undefined name here and raised NameError whenever no book was found.
    context['page_title'] = book.title if book else not_found_title
    return ('view.html', context)
def testUpdateFromBook(self):
    """Store a sample book with a changed 'mph' price using update=True.

    Expects one returned entity holding exactly one price whose attributes
    match the submitted price dict, and expects the total Book and Price
    counts to equal the sizes of the sample fixtures.
    """
    isbn = self.price_sample['mph']['book'].isbn13
    book = self.sample[isbn]
    book['_prices'] = [{ "source": 'mph', "price": 90.5 }]

    stored = models.store_books([book], update=True)
    count = len(stored)
    self.assertEqual(count, 1,
                     'returned %d entities, expected 1' % count)

    entity = stored[0]
    price_count = entity.price_set.count()
    self.assertEqual(price_count, 1,
                     'obj.price_set returned %d entities, expected 1' % price_count)

    price_entity = entity.price_set.fetch(1)[0]
    submitted = book['_prices'][0]
    for key, wanted in submitted.iteritems():
        got = getattr(price_entity, key)
        self.assertEqual(wanted, got,
                         "price['%s'] != obj.%s (expected '%s', got '%s')"
                         % (key, key, wanted, got))

    # re-test book dupe check, just in case
    book_total = models.Book.all().count()
    book_expected = len(self.sample)
    self.assertEqual(book_total, book_expected,
                     'book was added, not updated (expected %d books, got %d)'
                     % (book_expected, book_total))

    price_total = models.Price.all().count()
    price_expected = len(self.price_sample)
    self.assertEqual(price_total, price_expected,
                     'expected %d prices, got %d' % (price_expected, price_total))
def testCreateFromBook(self):
    """Store a sample book with a new 'times' price using update=True.

    Expects one returned entity holding exactly one price whose attributes
    match the submitted price dict, an unchanged total Book count, and a
    total Price count one higher than the price fixture.
    """
    # book 3 has no prices set up
    book = self.sample['9780596806026']
    book['_prices'] = [{ "source": 'times', "price": 42.42 }]

    stored = models.store_books([book], update=True)
    count = len(stored)
    self.assertEqual(count, 1,
                     'returned %d entities, expected 1' % count)

    entity = stored[0]
    price_count = entity.price_set.count()
    self.assertEqual(price_count, 1,
                     'obj.price_set returned %d entities, expected 1' % price_count)

    price_entity = entity.price_set.fetch(1)[0]
    submitted = book['_prices'][0]
    for key, wanted in submitted.iteritems():
        self.assertEqual(wanted, getattr(price_entity, key),
                         'price[%s] != obj.%s' % (key, key))

    # re-test book dupe check, just in case
    book_total = models.Book.all().count()
    book_expected = len(self.sample)
    self.assertEqual(book_total, book_expected,
                     'book was added, not updated (expected %d books, got %d)'
                     % (book_expected, book_total))

    price_total = models.Price.all().count()
    price_expected = len(self.price_sample) + 1
    self.assertEqual(price_total, price_expected,
                     'expected %d prices, got %d' % (price_expected, price_total))
def testCreateAndUpdate(self):
    """Store two books in one update=True call: one new, one already in the sample.

    Expects two returned entities whose attributes match the submitted dicts,
    and a total Book count one higher than the sample.
    """
    books = {
        # create:
        "9780596158064": {
            "isbn13": '9780596158064',
            "title": 'Learning Python',
            "authors": ['Mark Lutz']
        },
        # update:
        "9781430224150": {
            "isbn13": '9781430224150',
            "google_id": 'ekrhtG-Hn5IC',
            "title": 'Dive Into Python 3'
        }
    }

    stored = models.store_books(books.values(), update=True)
    count = len(stored)
    self.assertEqual(count, 2,
                     'returned %d entities, expected 2' % count)

    for entity in stored:
        submitted = books.get(entity.isbn13)
        for key, wanted in submitted.iteritems():
            self.assertEqual(wanted, getattr(entity, key),
                             'book[%s] != obj.%s' % (key, key))

    book_total = models.Book.all().count()
    book_expected = len(self.sample) + 1
    self.assertEqual(book_total, book_expected,
                     'expected %d books, got %d' % (book_expected, book_total))
def _get(self):
    """Run a book search for the ``q`` request parameter.

    A query matching the ISBN-13 or ISBN-10 pattern is rewritten as an
    ``isbn:`` keyword search.  A single result is stored right away and the
    request is redirected to that book's page; multiple results are decorated
    for the template and cached in memcache for later saving.

    Returns:
        ``(None, None)`` after a redirect, otherwise
        ``('search.html', context)``.
    """
    query = self.request.get('q', '').strip()

    # modify query to do an ISBN: keyword search if given query is one
    looks_like_isbn = (models.Book.ISBN13_REGEX.match(query)
                       or models.Book.ISBN10_REGEX.match(query))
    if looks_like_isbn:
        query = 'isbn:%s' % query

    context = {
        "page_title": u'Search: "%s"' % query,
        "page_id": 'search',
        "query": query,
        "books": []
    }

    query_hash = booksearch.hash_query(query)
    results = booksearch.search(query, hashed=query_hash)
    if not results:
        return ('search.html', context)

    # if only has 1 result, immediately save and redirect
    if len(results) == 1:
        # no need to update=True since google book search results
        # rarely change...
        models.store_books(results, update=False)
        self.redirect('/%s' % results[0]['isbn13'])
        return (None, None)

    # convert the resulting dict objects into temporary faux Book
    # entities for use in the template, then defer the actual
    # creation of the entities to BookHandler
    pending = {}
    for found in results:
        isbn13 = found['isbn13']
        found['permalink'] = '/%s' % isbn13
        found['links'] = dict(
            (link['name'], link['url']) for link in found.get('_links', []))
        pending['booktmp:%s' % isbn13] = found

    # temporarily cache the dicts for later saving in BookHandler
    # we give the user 20 minutes to browse/search before
    # clicking a book
    memcache.set_multi(pending, 1200)

    context['books'] = results
    return ('search.html', context)
def testCreate(self):
    """Store one new book and check that it is saved and counted.

    Expects a single saved entity whose isbn13, title and authors match the
    submitted dict, and a total Book count one higher than the sample.
    """
    submitted = {
        "isbn13": '9780596158064',
        "title": 'Learning Python',
        "authors": ['Mark Lutz']
    }

    stored = models.store_books([submitted])
    count = len(stored)
    self.assertEqual(count, 1,
                     'returned %d entities, expected 1' % count)

    entity = stored[0]
    self.assertTrue(entity.is_saved(), 'object not saved')
    self.assertEqual(entity.isbn13, submitted['isbn13'])
    self.assertEqual(entity.title, submitted['title'])
    self.assertEqual(entity.authors, submitted['authors'])

    book_total = models.Book.all().count()
    self.assertEqual(book_total, len(self.sample) + 1,
                     'book not added to datastore')
def testUpdate(self):
    """Store a book dict with update=True and check nothing new is created.

    Expects a single saved entity whose isbn13, google_id and title match the
    submitted dict, and a total Book count equal to the sample size.
    """
    submitted = {
        "isbn13": '9781430224150',
        "google_id": 'ekrhtG-Hn5IC',
        "title": 'Dive Into Python 3'
    }

    stored = models.store_books([submitted], update=True)
    count = len(stored)
    self.assertEqual(count, 1,
                     'returned %d entities, expected 1' % count)

    entity = stored[0]
    self.assertTrue(entity.is_saved(), 'object not saved')
    self.assertEqual(entity.isbn13, submitted['isbn13'])
    self.assertEqual(entity.google_id, submitted['google_id'])
    self.assertEqual(entity.title, submitted['title'])

    book_total = models.Book.all().count()
    book_expected = len(self.sample)
    self.assertEqual(book_total, book_expected,
                     'book was added, not updated (expected %d books, got %d)'
                     % (book_expected, book_total))
def setUp(self):
    """Index the sample books by ISBN-13 and store them via models.store_books."""
    by_isbn = {}
    for entry in sample_books:
        by_isbn[entry['isbn13']] = entry
    self.sample = by_isbn
    self.objects = models.store_books(sample_books)