def test_scrapes_search_results2(self):
    """
    Scrape the 'search2' fixture and compare the outcome with the
    expectations stored in 'search2_results/results.json'.

    Verified in order: paging fields ('next_url', 'current_page',
    'results_count'), suggested search links, the sizes of both product
    lists, and finally every field of every expected product.
    """
    response = retrieve_response(os.path.join(fixtures_path, 'search2'))
    self.assertIsNotNone(response)
    results_file = os.path.join(fixtures_path, 'search2_results/results.json')

    scraper = AmazonScraper()
    result = scraper.scrape_search_results_page(response)
    # The context manager closes the fixture file deterministically; a bare
    # open() passed to json.load() left the handle to the garbage collector.
    with open(results_file) as results_fh:
        expected = json.load(results_fh)

    def check_products(expected_products, scraped_products):
        # Every expected product must be matched by exactly one scraped
        # product (by 'identifier') and agree with it on every expected field.
        for expected_product in expected_products:
            identifier = expected_product['identifier']
            found_products = [x for x in scraped_products
                              if x['identifier'] == identifier]
            self.assertGreater(
                len(found_products), 0,
                "Not found product with identifier: %s" % identifier)
            self.assertLess(
                len(found_products), 2,
                "Found more than 1 product with identifier: %s" % identifier)
            found_product = found_products.pop()
            for key, value in expected_product.items():
                actual = found_product[key]
                # Prices are stored as strings in the fixture and compared
                # as Decimal against the scraped value.
                wanted = Decimal(value) if key == 'price' else value
                self.assertEqual(
                    actual, wanted,
                    "Field %s does not match: expected '%s', found '%s'"
                    % (key, value, actual))

    # test next page url, current page number, results count
    for field in ('next_url', 'current_page', 'results_count'):
        if expected[field] is None:
            self.assertIsNone(result[field])
        else:
            self.assertEqual(result[field], expected[field])

    # check links to suggested searches (expected links must all be present;
    # extra scraped links are tolerated)
    for link in expected['suggested_search_urls']:
        self.assertIn(link, result['suggested_search_urls'])

    # check number of products and suggested products matches
    self.assertEqual(len(expected['products']), len(result['products']))
    self.assertEqual(len(expected['suggested_products']),
                     len(result['suggested_products']))

    check_products(expected['suggested_products'], result['suggested_products'])
    check_products(expected['products'], result['products'])
def test_empties_mbc_urls_for_kindle_books(self):
    """
    Check the 'mbc_list_url_new' / 'mbc_list_url_used' fields of scraped
    product details for a non-Kindle and a Kindle book fixture.

    For the 'non_kindle_book' fixture both fields must be non-empty; for the
    'kindle_book' fixture both must be None.
    """
    mbc_fields = ('mbc_list_url_new', 'mbc_list_url_used')
    scraper = AmazonScraper()

    # Non-Kindle fixture: details are scraped first, then the book type is
    # checked, then both URL fields must have a positive length.
    paper_response = retrieve_response(
        os.path.join(fixtures_path, 'non_kindle_book'))
    self.assertIsNotNone(paper_response)
    paper_details = scraper.scrape_product_details_page(paper_response)
    self.assertFalse(scraper.is_kindle_book(paper_response))
    for field in mbc_fields:
        self.assertTrue(len(paper_details[field]) > 0)

    # Kindle fixture: the book type is checked before scraping, and both
    # URL fields must come back as None.
    kindle_response = retrieve_response(
        os.path.join(fixtures_path, 'kindle_book'))
    self.assertIsNotNone(kindle_response)
    self.assertTrue(scraper.is_kindle_book(kindle_response))
    kindle_details = scraper.scrape_product_details_page(kindle_response)
    for field in mbc_fields:
        self.assertIsNone(kindle_details[field])
def test_parse_mbc_list(self):
    """
    Benchmark "parse_mbc_list" for every spider configuration.

    Each combination of spider type and reviews flag is timed with timeit
    over 100000 calls on the 'mbc_list' fixture; the test fails when the
    mean time per call is not below 0.0001 seconds.
    """
    # NOTE(review): the previous docstring quoted a 1 ms limit while the
    # assertion enforces 0.0001 s (0.1 ms) -- confirm which one is intended.
    spider_types = ['amazon_direct', 'only_buybox', 'all_sellers']
    reviews_flags = [True, False]
    fixture = os.path.join(fixtures_path, 'mbc_list')
    url = (
        'http://www.amazon.co.uk/gp/offer-listing/B002EEP3NO/ref=sr_1_1_olp/'
        '279-2607573-2864855?ie=UTF8&qid=1395239715&sr=8-1&keywords=LEGO+10188&condition=new')
    repeats = 100000

    for spider_type, reviews_enabled in product(spider_types, reviews_flags):
        spider = self._build_spider(spider_type, reviews_enabled)
        response = retrieve_response(fixture)
        # A request is attached to the fixture response before parsing.
        response.request = Request(url, callback=spider.parse_product_list)

        def run_once():
            spider.parse_mbc_list(response)

        elapsed = timeit.timeit(run_once, number=repeats)
        reviews_label = 'enabled' if reviews_enabled else 'disabled'
        self.assertLess(
            elapsed / repeats, 0.0001,
            "Too slow in spider {} with reviews {}".format(
                spider_type, reviews_label))
def test_parse_products_list(self):
    """
    Benchmark "parse_product_list" for every spider configuration.

    Each combination of spider type and reviews flag is timed with timeit
    over 100000 calls on the 'search1' fixture; the test fails when the
    mean time per call is not below 0.0001 seconds.
    """
    # NOTE(review): the previous docstring quoted a 1 ms limit while the
    # assertion enforces 0.0001 s (0.1 ms) -- confirm which one is intended.
    spider_types = ['amazon_direct', 'only_buybox', 'all_sellers']
    reviews_flags = [True, False]
    fixture = os.path.join(fixtures_path, 'search1')
    url = 'http://www.amazon.co.uk/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords=LEGO%2010188'
    repeats = 100000

    for spider_type, reviews_enabled in product(spider_types, reviews_flags):
        spider = self._build_spider(spider_type, reviews_enabled)
        response = retrieve_response(fixture)
        # A request is attached to the fixture response before parsing.
        response.request = Request(url, callback=spider.parse_product_list)

        def run_once():
            spider.parse_product_list(response)

        elapsed = timeit.timeit(run_once, number=repeats)
        reviews_label = 'enabled' if reviews_enabled else 'disabled'
        self.assertLess(
            elapsed / repeats, 0.0001,
            "Too slow in spider {} with reviews {}".format(
                spider_type, reviews_label))
def test_scrapes_product_details(self):
    """
    Scrape the 'product' fixture and compare the outcome with the
    expectations stored in 'product_results/results.json'.

    Non-list fields are compared one by one, 'option_texts' is compared by
    size and membership, and every expected option must be matched by
    exactly one scraped option (by 'identifier') with equal fields.
    """
    response = retrieve_response(os.path.join(fixtures_path, 'product'))
    self.assertIsNotNone(response)
    results_file = os.path.join(fixtures_path, 'product_results/results.json')

    scraper = AmazonScraper()
    result = scraper.scrape_product_details_page(response)
    # The context manager closes the fixture file deterministically; a bare
    # open() passed to json.load() left the handle to the garbage collector.
    with open(results_file) as results_fh:
        expected = json.load(results_fh)

    def to_price(raw):
        # A fixture price containing '-' is a "low-high" range compared as a
        # (Decimal, Decimal) tuple; any other price is a single Decimal.
        if '-' in raw:
            low, high = raw.split("-")
            return (Decimal(low), Decimal(high))
        return Decimal(raw)

    def check_field(scraped_fields, key, value):
        # Compare one scraped field with its expectation, converting the
        # expected value first when the field is a price.
        actual = scraped_fields[key]
        wanted = to_price(value) if key == 'price' else value
        self.assertEqual(
            actual, wanted,
            "Field %s does not match: expected '%s', found '%s'"
            % (key, value, actual))

    for key, value in expected.items():
        # List-valued fields ('option_texts', 'options') are checked below.
        if isinstance(value, list):
            continue
        self.assertIn(key, result.keys())
        if value is None:
            self.assertIsNone(result[key])
        else:
            check_field(result, key, value)

    self.assertEqual(len(expected['option_texts']), len(result['option_texts']))
    for option_text in expected['option_texts']:
        self.assertIn(option_text, result['option_texts'])

    self.assertEqual(len(expected['options']), len(result['options']))
    for option in expected['options']:
        identifier = option['identifier']
        found_options = [x for x in result['options']
                         if x['identifier'] == identifier]
        self.assertGreater(
            len(found_options), 0,
            "Not found option with identifier: %s" % identifier)
        self.assertLess(
            len(found_options), 2,
            "Found more than 1 option with identifier: %s" % identifier)
        found_option = found_options.pop()
        for key, value in option.items():
            check_field(found_option, key, value)
def test_scrapes_reviews(self):
    """
    Scrape the 'reviews_list' fixture and compare the outcome with the
    expectations stored in 'reviews_list_results/results.json'.

    Expected review dates are parsed from "%Y-%m-%dT%H:%M:%S" strings into
    datetime objects before comparison. Paging fields, the number of
    reviews and every field of every expected review are verified.
    """
    response = retrieve_response(os.path.join(fixtures_path, 'reviews_list'))
    self.assertIsNotNone(response)
    results_file = os.path.join(fixtures_path, 'reviews_list_results/results.json')

    scraper = AmazonScraper()
    result = scraper.scrape_reviews_list_page(response)
    # The context manager closes the fixture file deterministically; a bare
    # open() passed to json.load() left the handle to the garbage collector.
    with open(results_file) as results_fh:
        expected = json.load(results_fh)

    # JSON carries dates as strings; convert them so they can be compared
    # with the scraped values.
    for review in expected['reviews']:
        review['date'] = datetime.datetime.strptime(review['date'], "%Y-%m-%dT%H:%M:%S")

    # test next page url, current page number
    for field in ('next_url', 'current_page'):
        if expected[field] is None:
            self.assertIsNone(result[field])
        else:
            self.assertEqual(result[field], expected[field])

    # check number of reviews matches
    self.assertEqual(len(expected['reviews']), len(result['reviews']))

    # every expected review must be matched by exactly one scraped review
    # (by 'identifier') and agree with it on every expected field
    for review in expected['reviews']:
        identifier = review['identifier']
        found_reviews = [x for x in result['reviews']
                         if x['identifier'] == identifier]
        self.assertGreater(
            len(found_reviews), 0,
            "Not found product with identifier: %s" % identifier)
        self.assertLess(
            len(found_reviews), 2,
            "Found more than 1 product with identifier: %s" % identifier)
        found_review = found_reviews.pop()
        for key, value in review.items():
            actual = found_review[key]
            wanted = Decimal(value) if key == 'price' else value
            self.assertEqual(
                actual, wanted,
                "Field %s does not match: expected '%s', found '%s'"
                % (key, value, actual))