Example #1
    def setUp(self):
        # get_file_handle returns a (file_object, contents) pair per fixture.
        self.pages = {
            'lenarguile': get_file_handle('le-narguile.com.json'),
            'royaledeco': get_file_handle('royaledeco.com.json'),
            '10k00nt': get_file_handle('10k00nt.com.json')
        }

        # Build (None, raw_bytes) pairs from the fixture contents for init_data.
        data = [(None, value[1].encode()) for value in self.pages.values()]
        roots, scrape_data = init_data(data)
        self.s = Scraper(scrape_data)
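
The get_file_handle helper these examples depend on is not shown. Here is a minimal sketch, assuming it opens a fixture file and returns a (file_object, contents) pair, which is what the value[0]/value[1] accesses above imply; the fixtures directory and encoding are assumptions:

import os

FIXTURES_DIR = os.path.join(os.path.dirname(__file__), 'fixtures')  # assumed layout

def get_file_handle(filename):
    # Hypothetical helper: open a fixture and return (handle, contents).
    f = open(os.path.join(FIXTURES_DIR, filename), encoding='utf-8')
    return f, f.read()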
Example #2
    def test_age(self):
        # At least one of used/new/nearlyNew must be enabled.
        with self.assertRaises(ValueError):
            Scraper("any", "any", used=False, new=False, nearlyNew=False)

        used = Scraper("any", "any", used=True, new=False, nearlyNew=False)
        self.assertIn('onesearchad=Used', used.url)
        self.assertNotIn('onesearchad=New', used.url)
        self.assertNotIn('onesearchad=Nearly%20New', used.url)

        new = Scraper("any", "any", used=False, new=True, nearlyNew=False)
        self.assertIn('onesearchad=New', new.url)
        self.assertNotIn('onesearchad=Nearly%20New', new.url)
        self.assertNotIn('onesearchad=Used', new.url)

        nearlyNew = Scraper("any",
                            "any",
                            used=False,
                            new=False,
                            nearlyNew=True)
        self.assertIn('onesearchad=Nearly%20New', nearlyNew.url)
        self.assertNotIn('onesearchad=New', nearlyNew.url)
        self.assertNotIn('onesearchad=Used', nearlyNew.url)
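
The test implies that Scraper raises ValueError when every age filter is off and otherwise appends one onesearchad query parameter per enabled filter. A minimal sketch of that URL-building step follows; build_age_params is a hypothetical name, not the project's API:

from urllib.parse import quote, urlencode

def build_age_params(used, new, nearly_new):
    # Hypothetical sketch mirroring the behaviour the assertions check.
    if not (used or new or nearly_new):
        raise ValueError('at least one age filter must be enabled')
    ages = [label for label, flag in
            [('Used', used), ('New', new), ('Nearly New', nearly_new)] if flag]
    # quote_via=quote encodes the space in 'Nearly New' as %20, as the test expects.
    return urlencode([('onesearchad', a) for a in ages], quote_via=quote)

build_age_params(False, False, True)  # -> 'onesearchad=Nearly%20New'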
Example #3
# Scraper, WireProtocol, chunks and config come from the surrounding project.
import math
import os
import queue
import sys


def main():
    scraper = Scraper(db_path='./products.db',
                      run_every=config.scraper['run_every'])

    products = scraper.get_products_list()
    if not products:
        return False

    # Split the product list into at most n_workers fixed-size chunks.
    n_workers = config.scraper['workers']
    group_size = math.ceil(len(products) / n_workers)

    groups = list(chunks(products, group_size))
    workers = queue.Queue()

    for group in groups:
        # Give each worker a pipe pair, then fork.
        wp = WireProtocol(*os.pipe())
        pid = os.fork()

        if pid == 0:
            # Child process: scrape one chunk, report through the pipe, exit.
            scraper.get_prices(wp, group)
            sys.exit(0)

        workers.put((pid, wp))

    while not workers.empty():
        # Parent process: drain each worker's pipe, then reap the child.
        pid, wp = workers.get()
        scraper.save_prices(wp)

        try:
            os.waitpid(pid, 0)
        except OSError:
            pass

    sys.stdout.flush()
    sys.stderr.flush()

    return True
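
chunks is not defined in this example. A common implementation, assumed here, yields successive fixed-size slices, so ceil(len(products) / n_workers) acts as the chunk size and at most n_workers groups are produced:

def chunks(items, size):
    # Assumed helper: yield consecutive slices of at most `size` items.
    for i in range(0, len(items), size):
        yield items[i:i + size]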
Example #4
class TestDataExtractor(unittest.TestCase):
    def setUp(self):
        # get_file_handle returns a (file_object, contents) pair per fixture.
        self.pages = {
            'lenarguile': get_file_handle('le-narguile.com.json'),
            'royaledeco': get_file_handle('royaledeco.com.json'),
            '10k00nt': get_file_handle('10k00nt.com.json')
        }

        # Build (None, raw_bytes) pairs from the fixture contents for init_data.
        data = [(None, value[1].encode()) for value in self.pages.values()]
        roots, scrape_data = init_data(data)
        self.s = Scraper(scrape_data)

    def tearDown(self):
        for f in self.pages.values():
            f[0].close()

    def _assert_data(self, data, price, img_link, name, bc):
        self.assertTrue(data.get('error', True))
        expected = {
            'prix_css': price,
            'image_css': img_link,
            'nom_css': name,
            'breadcrumb_css': bc
        }
        for key, value in expected.items():
            # Scraped fields may be a single value or an iterable of candidates.
            if hasattr(data[key], '__iter__'):
                self.assertIn(value, data[key])
            else:
                self.assertEqual(value, data[key])

    def test_file_opened(self):
        self.assertTrue(self.pages['lenarguile'][0])

    def test_scrape_no_data_in_html(self):
        # Scraping a page with no product markup should yield only empty fields.
        html = '<html></html>'
        url = 'www.ex.com'
        data = self.s.scrape(url, html)
        for value in data.values():
            self.assertFalse(value)

    def test_scrape_valid_data_in_le_narguile(self):
        f = get_file_handle('le-narguile.html')
        html = f[1]
        f[0].close()
        url = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/'
        data = self.s.scrape(url, html)
        img_link = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/350x350/9df78eab33525d08d6e5fb8d27136e95/p/i/picture_2013_1.jpg'
        price = '99,00\xa0€'
        name = 'Narguilé syrien Star argenté de 79 cm'
        bc = None
        self._assert_data(data, price, img_link, name, bc)

    def test_scrape_valid_data_in_royaledeco(self):
        f = get_file_handle('royaledeco.html')
        html = f[1]
        url = 'http://www.royaledeco.com/67686-mainpict/'
        data = self.s.scrape(url, html)
        f[0].close()
        img_link = 'http://www.royaledeco.com/67686-mainpict/fauteuil-galaxy-blanc.jpg'
        price = '129,00 € TTC'
        name = 'Fauteuil Galaxy blanc'
        bc = None
        self._assert_data(data, price, img_link, name, bc)
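
Assuming the standard unittest entry point, the suite above can be run directly:

if __name__ == '__main__':
    unittest.main()

or with python -m unittest from the directory containing the test module.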
Example #5
def scrape():
    """Scrape puzzles from the site."""
    scraper = Scraper()
    scraper.scrape()