def setUp(self):
    """Open the fixture JSON pages and build a Scraper from their scrape data."""
    self.pages = {
        'lenarguile': get_file_handle('le-narguile.com.json'),
        'royaledeco': get_file_handle('royaledeco.com.json'),
        '10k00nt': get_file_handle('10k00nt.com.json'),
    }
    # Each fixture value is a (file_object, contents) pair; init_data wants
    # (root, encoded_contents) tuples.
    fixtures = [(None, pair[1].encode()) for pair in self.pages.values()]
    roots, scrape_data = init_data(fixtures)
    self.s = Scraper(scrape_data)
def setUp(self):
    """Load the three site fixtures and construct the Scraper under test."""
    fixture_names = {
        'lenarguile': 'le-narguile.com.json',
        'royaledeco': 'royaledeco.com.json',
        '10k00nt': '10k00nt.com.json',
    }
    self.pages = {alias: get_file_handle(fname)
                  for alias, fname in fixture_names.items()}
    payload = []
    for entry in self.pages.values():
        # entry is (file_object, contents); init_data needs encoded bytes.
        payload.append((None, entry[1].encode()))
    roots, scrape_data = init_data(payload)
    self.s = Scraper(scrape_data)
def test_age(self):
    """Each age flag must map onto exactly one `onesearchad` query parameter.

    All three flags off is rejected; with exactly one flag on, only the
    matching parameter may appear in the built URL.
    """
    # No age category selected at all is an invalid configuration.
    with self.assertRaises(ValueError):
        Scraper("any", "any", used=False, new=False, nearlyNew=False)

    flag_to_param = {
        'used': 'onesearchad=Used',
        'new': 'onesearchad=New',
        'nearlyNew': 'onesearchad=Nearly%20New',
    }
    for active, expected_param in flag_to_param.items():
        flags = {name: name == active for name in flag_to_param}
        scraper = Scraper("any", "any", **flags)
        self.assertIn(expected_param, scraper.url)
        # None of the other categories' parameters may leak into the URL.
        for other, param in flag_to_param.items():
            if other != active:
                self.assertNotIn(param, scraper.url)
def main():
    """Fork worker processes to fetch prices for all products in parallel.

    Splits the product list into ceil(len(products) / workers) sized groups,
    forks one child per group, and collects each child's results over a pipe.

    Returns:
        bool: False when the product list is empty, True otherwise.
    """
    scraper = Scraper(db_path='./products.db', run_every=config.scraper['run_every'])
    products = scraper.get_products_list()
    if not products:
        return False
    n_workers = config.scraper['workers']
    # Ceiling division: every product lands in some group.
    n_groups = int(math.ceil(float(len(products)) / float(n_workers)))
    groups = list(chunks(products, n_groups))
    workers = Queue.Queue()
    for i in range(n_workers):
        if i >= n_groups:
            break  # more workers configured than groups available
        wp = WireProtocol(*os.pipe())
        pid = os.fork()
        if pid == 0:
            # Child: scrape its group, then exit with os._exit so the
            # inherited atexit handlers don't run and the stdio buffers
            # duplicated by fork() aren't flushed a second time
            # (sys.exit(0) would do both).
            scraper.get_prices(wp, groups[i])
            os._exit(0)
        workers.put((pid, wp))
    while not workers.empty():
        pid, wp = workers.get()
        scraper.save_prices(wp)
        try:
            os.waitpid(pid, 0)
        except OSError:
            # Child already reaped -- nothing left to wait for.
            pass
    sys.stdout.flush()
    sys.stderr.flush()
    return True
def main():
    """Dispatch price scraping across forked worker processes.

    Partitions the product list into groups of ceil(len(products) / workers)
    items, forks one child per group, and reads results back through pipes.

    Returns:
        bool: False when there are no products to process, True otherwise.
    """
    scraper = Scraper(
        db_path='./products.db',
        run_every=config.scraper['run_every']
    )
    products = scraper.get_products_list()
    if not products:
        return False

    n_workers = config.scraper['workers']
    # Group size is the ceiling of products/workers so nothing is dropped.
    n_groups = int(math.ceil(float(len(products)) / float(n_workers)))
    groups = list(chunks(products, n_groups))

    workers = Queue.Queue()
    for i in range(n_workers):
        if i >= n_groups:
            # Fewer groups than configured workers: stop forking.
            break
        wp = WireProtocol(*os.pipe())
        pid = os.fork()
        if pid == 0:
            # Child process: scrape one group, then terminate immediately.
            # os._exit skips atexit handlers and avoids re-flushing the
            # stdio buffers inherited from the parent; sys.exit(0) would
            # run both and can emit duplicate output.
            scraper.get_prices(wp, groups[i])
            os._exit(0)
        workers.put((pid, wp))

    while not workers.empty():
        pid, wp = workers.get()
        scraper.save_prices(wp)
        try:
            os.waitpid(pid, 0)
        except OSError:
            # The child may have been reaped already; that's fine.
            pass

    sys.stdout.flush()
    sys.stderr.flush()
    return True
class TestDataExtractor(unittest.TestCase):
    """Exercises Scraper.scrape against saved fixture pages."""

    def setUp(self):
        """Open the fixture JSON pages and build the Scraper under test."""
        self.pages = {
            'lenarguile': get_file_handle('le-narguile.com.json'),
            'royaledeco': get_file_handle('royaledeco.com.json'),
            '10k00nt': get_file_handle('10k00nt.com.json'),
        }
        fixtures = [(None, pair[1].encode()) for pair in self.pages.values()]
        roots, scrape_data = init_data(fixtures)
        self.s = Scraper(scrape_data)

    def tearDown(self):
        """Close every fixture file handle opened in setUp."""
        for handle in self.pages.values():
            handle[0].close()

    def _assert_field(self, actual, expected):
        # A scraped field may be a single value or an iterable of candidates.
        if hasattr(actual, '__iter__'):
            self.assertTrue(expected in actual)
        else:
            self.assertTrue(expected == actual)

    def _assert_data(self, data, price, img_link, name, bc):
        """Check the four scraped fields against their expected values."""
        self.assertTrue(data.get('error', True))
        self._assert_field(data['prix_css'], price)
        self._assert_field(data['image_css'], img_link)
        self._assert_field(data['nom_css'], name)
        self._assert_field(data['breadcrumb_css'], bc)

    def test_file_opened(self):
        self.assertTrue(self.pages['lenarguile'][0])

    def test_scrape_no_data_in_html(self):
        """An empty document yields falsy values for every field."""
        html = '<html></html>'
        url = 'www.ex.com'
        for site, fixture in self.pages.items():
            css_selectors = json.loads(fixture[1])['selectors']
            result = self.s.scrape(url, html)
            for field, value in result.items():
                self.assertFalse(value)

    def test_scrape_valid_data_in_le_narguile(self):
        handle = get_file_handle('le-narguile.html')
        html = handle[1]
        handle[0].close()
        url = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/'
        result = self.s.scrape(url, html)
        self._assert_data(
            result,
            '99,00\xa0€',
            'http://www.le-narguile.com/media/catalog/product/cache/6/image/350x350/9df78eab33525d08d6e5fb8d27136e95/p/i/picture_2013_1.jpg',
            'Narguilé syrien Star argenté de 79 cm',
            None,
        )

    def test_scrape_valid_data_in_royaldeco(self):
        handle = get_file_handle('royaledeco.html')
        html = handle[1]
        url = 'http://www.royaledeco.com/67686-mainpict/'
        result = self.s.scrape(url, html)
        handle[0].close()
        self._assert_data(
            result,
            '129,00 € TTC',
            'http://www.royaledeco.com/67686-mainpict/fauteuil-galaxy-blanc.jpg',
            'Fauteuil Galaxy blanc',
            None,
        )
def scrape():
    """Run one full scrape of puzzles from the site."""
    Scraper().scrape()
class TestDataExtractor(unittest.TestCase):
    """Validates scraping of price, image, name and breadcrumb fields."""

    def setUp(self):
        """Load the three site fixtures and construct the Scraper."""
        fixture_files = (
            ('lenarguile', 'le-narguile.com.json'),
            ('royaledeco', 'royaledeco.com.json'),
            ('10k00nt', '10k00nt.com.json'),
        )
        self.pages = {alias: get_file_handle(path)
                      for alias, path in fixture_files}
        raw = [(None, entry[1].encode()) for entry in self.pages.values()]
        roots, scrape_data = init_data(raw)
        self.s = Scraper(scrape_data)

    def tearDown(self):
        # Release the file handles opened by setUp.
        for alias, entry in self.pages.items():
            entry[0].close()

    def _assert_data(self, data, price, img_link, name, bc):
        """Assert each scraped field matches (or contains) its expectation."""
        self.assertTrue(data.get('error', True))
        expectations = (
            ('prix_css', price),
            ('image_css', img_link),
            ('nom_css', name),
            ('breadcrumb_css', bc),
        )
        for field, expected in expectations:
            actual = data[field]
            # Fields may come back as a scalar or as a list of candidates.
            if hasattr(actual, '__iter__'):
                self.assertTrue(expected in actual)
            else:
                self.assertTrue(expected == actual)

    def test_file_opened(self):
        self.assertTrue(self.pages['lenarguile'][0])

    def test_scrape_no_data_in_html(self):
        """Scraping an empty page produces only falsy field values."""
        empty_html = '<html></html>'
        target = 'www.ex.com'
        for alias, entry in self.pages.items():
            selectors = json.loads(entry[1])['selectors']
            scraped = self.s.scrape(target, empty_html)
            for field, value in scraped.items():
                self.assertFalse(value)

    def test_scrape_valid_data_in_le_narguile(self):
        entry = get_file_handle('le-narguile.html')
        page_html = entry[1]
        entry[0].close()
        base = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/'
        scraped = self.s.scrape(base, page_html)
        self._assert_data(
            scraped,
            '99,00\xa0€',
            'http://www.le-narguile.com/media/catalog/product/cache/6/image/350x350/9df78eab33525d08d6e5fb8d27136e95/p/i/picture_2013_1.jpg',
            'Narguilé syrien Star argenté de 79 cm',
            None,
        )

    def test_scrape_valid_data_in_royaldeco(self):
        entry = get_file_handle('royaledeco.html')
        page_html = entry[1]
        base = 'http://www.royaledeco.com/67686-mainpict/'
        scraped = self.s.scrape(base, page_html)
        entry[0].close()
        self._assert_data(
            scraped,
            '129,00 € TTC',
            'http://www.royaledeco.com/67686-mainpict/fauteuil-galaxy-blanc.jpg',
            'Fauteuil Galaxy blanc',
            None,
        )