def setUpClass(cls):
    '''Download (when required) and pre-load the saved page for every url
    listed in TEST_URLS, so tests can read them from cls.rows.

    cls.rows holds one list per line read from TEST_URLS:

    - first come the pieces obtained by splitting the line on its first
      comma (the item url, then the image url);
    - next come the objects returned by the scraper's load() for the item
      url (e.g. an etsy listing object and an etsy seller object);
    - the last element is the scraper object selected for the url's
      domain, so a test can take it from the end of the row instead of
      calling constructors again.

    Note: the only requirement is that the writer of a test knows which
    row to use for each test.

    Side effects: stores the starting working directory in cls.cur_dir,
    changes the working directory to DOWNLOAD_DIR, and sets
    cls._test_is_set to True once every row is loaded.
    '''
    cls.cur_dir = os.getcwd()
    reader = sopen(TEST_URLS)
    # Keep the factory separate from the per-domain scraper: rebinding a
    # single name inside the loop would make every later row ask the
    # previous row's scraper (not the factory) for the next scraper.
    scraper_factory = Scraper()
    chdir(DOWNLOAD_DIR)
    cls.rows = []
    for line in reader:
        row = line.split(",", 1)
        site_scraper = scraper_factory.get_scraper(get_domain(row[0]))
        # NOTE(review): assuming chdir/exists are os.chdir/os.path.exists,
        # chdir(DOWNLOAD_DIR) above has already succeeded, so for an
        # absolute DOWNLOAD_DIR this existence check can never be true and
        # pages are downloaded only when REWRITE is set -- confirm intent.
        if REWRITE or not exists(DOWNLOAD_DIR):
            site_scraper.download(row[0])
        row.extend(site_scraper.load(row[0]))
        row.append(site_scraper)
        cls.rows.append(row)
    cls._test_is_set = True
def test_amazon_scraper(self):
    '''Test get_item_info for AmazonScraper.

    Asks the scraper registered for 'www.amazon.com' for the item info of
    one fixed product url / image url pair and checks the price, currency
    code, like count, new-item quantity and discount value it reports.
    '''
    # NOTE(review): get_item_info is handed a real product url; presumably
    # this needs network access and a stable page -- confirm.
    factory = Scraper()
    amazon_scraper = factory.get_scraper('www.amazon.com')
    item = amazon_scraper.get_item_info(
        'http://www.amazon.com/gp/product/B002P8T0L0/ref=s9_simh_gw_p23_d0_g23_i1?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=center-2&pf_rd_r=0WQ1VFHRSY7ZTB93FGYG&pf_rd_t=101&pf_rd_p=470938631&pf_rd_i=507846',
        'http://ecx.images-amazon.com/images/I/31hak2cSIOL.jpg')
    # Float-valued fields are compared with a tolerance so that a value
    # produced by float arithmetic does not fail on representation error.
    self.assertAlmostEqual(item.price, 75.99)
    self.assertEqual(item.currency_code, '$')
    self.assertEqual(item.user_interaction.likes, 42)
    self.assertEqual(item.quantity.new, 5)
    self.assertAlmostEqual(item.details.discount.value, 43.96)
def test_amazon_scraper(self):
    '''Test get_item_info for AmazonScraper.

    Asks the scraper registered for 'www.amazon.com' for the item info of
    one fixed product url / image url pair and checks the price, currency
    code, like count, new-item quantity and discount value it reports.
    '''
    # NOTE(review): get_item_info is handed a real product url; presumably
    # this needs network access and a stable page -- confirm.
    factory = Scraper()
    amazon_scraper = factory.get_scraper('www.amazon.com')
    item = amazon_scraper.get_item_info(
        'http://www.amazon.com/gp/product/B002P8T0L0/ref=s9_simh_gw_p23_d0_g23_i1?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=center-2&pf_rd_r=0WQ1VFHRSY7ZTB93FGYG&pf_rd_t=101&pf_rd_p=470938631&pf_rd_i=507846',
        'http://ecx.images-amazon.com/images/I/31hak2cSIOL.jpg')
    # Float-valued fields are compared with a tolerance so that a value
    # produced by float arithmetic does not fail on representation error.
    self.assertAlmostEqual(item.price, 75.99)
    self.assertEqual(item.currency_code, '$')
    self.assertEqual(item.user_interaction.likes, 42)
    self.assertEqual(item.quantity.new, 5)
    self.assertAlmostEqual(item.details.discount.value, 43.96)
def _pinscraperow(row, row_num):
    '''Scrape a single url row into a directory named after the item url.

    row: indexable whose first element is the item url and whose second
        element is the image url; both are stripped of surrounding
        whitespace before use.
    row_num: accepted but not used by this function.

    A directory named after the url-quoted item url is created and the
    image is downloaded into it before a scraper is looked up. When a
    scraper exists for the url's domain, the item info is written to
    info.json in that directory, or a not_found.txt note is written when
    no info comes back.

    Returns True when a scraper was available for the domain (whether or
    not item info was found); otherwise returns the domain.
    '''
    factory = Scraper()
    item_url, image_url = row[0].strip(), row[1].strip()

    # The quoted url doubles as the output directory name.
    out_dir = urllib.parse.quote_plus(item_url)
    mkdir(out_dir)
    download_image(image_url, out_dir)

    domain = get_domain(item_url)
    site_scraper = factory.get_scraper(domain)
    if not site_scraper:
        # No scraper for this domain: hand the domain back to the caller.
        return domain

    print("Getting information from {0}... ".format(domain))
    info = site_scraper.get_item_info(item_url, image_url)
    if not info:
        write_to_file(
            '{0}/not_found.txt'.format(out_dir),
            'w',
            'The url at {0} was not found'.format(item_url))
    else:
        json_dump_to_file('{0}/info.json'.format(out_dir), info)
    return True