Exemple #1
0
def scrape():
    """Build a default ``Scraper`` and run its ``scrape`` method.

    Per the original note, this scrapes puzzles from the site. Nothing is
    returned; whatever ``Scraper.scrape`` produces is discarded.
    """
    Scraper().scrape()
class TestDataExtractor(unittest.TestCase):
    """Exercise ``Scraper.scrape`` against the stored site fixtures.

    ``setUp`` loads three JSON fixtures through ``get_file_handle``, feeds
    their encoded contents to ``init_data`` and builds the ``Scraper`` under
    test from the resulting scrape data.

    NOTE(review): a second class with this same name is defined later in
    this file and rebinds the name, so test discovery only sees that later
    definition -- confirm whether both are meant to exist.
    """

    # (short site name, fixture file) pairs, in the order they are loaded.
    _SELECTOR_FIXTURES = (
        ('lenarguile', 'le-narguile.com.json'),
        ('royaledeco', 'royaledeco.com.json'),
        ('10k00nt', '10k00nt.com.json'),
    )

    def setUp(self):
        """Open the selector fixtures and build the scraper under test."""
        self.pages = {}
        for site, filename in self._SELECTOR_FIXTURES:
            page = get_file_handle(filename)
            # Register the close immediately: unittest skips tearDown when
            # setUp raises, but cleanups still run, so no handle leaks if a
            # later fixture or init_data fails.
            self.addCleanup(page[0].close)
            self.pages[site] = page

        data = [(None, page[1].encode()) for page in self.pages.values()]
        _, scrape_data = init_data(data)
        self.s = Scraper(scrape_data)

    def _read_fixture(self, filename):
        """Return the content of fixture *filename*, always closing its handle."""
        fixture = get_file_handle(filename)
        try:
            return fixture[1]
        finally:
            fixture[0].close()

    def _assert_data(self, data, price, img_link, name, bc):
        """Assert that scraped *data* carries the expected field values.

        Args:
            data: Mapping returned by ``Scraper.scrape``.
            price: Expected value for ``'prix_css'``.
            img_link: Expected value for ``'image_css'``.
            name: Expected value for ``'nom_css'``.
            bc: Expected value for ``'breadcrumb_css'``.

        A field whose scraped value has ``__iter__`` must contain the
        expected value; any other scraped value must equal it.
        """
        # NOTE(review): this requires 'error' to be truthy (or absent) for a
        # successful scrape -- confirm that is the intended meaning.
        self.assertTrue(data.get('error', True))

        expectations = (
            ('prix_css', price),
            ('image_css', img_link),
            ('nom_css', name),
            ('breadcrumb_css', bc),
        )
        for field, expected in expectations:
            actual = data[field]
            # NOTE(review): on Python 3 ``str`` also has ``__iter__``, so a
            # plain string result is checked by substring containment here.
            if hasattr(actual, '__iter__'):
                self.assertIn(expected, actual, field)
            else:
                self.assertEqual(expected, actual, field)

    def test_file_opened(self):
        """The first element of a loaded fixture must be truthy."""
        self.assertTrue(self.pages['lenarguile'][0])

    def test_scrape_no_data_in_html(self):
        """Scraping an empty document yields only falsy values."""
        html = '<html></html>'
        url = 'www.ex.com'
        for site, page in self.pages.items():
            # Each fixture is expected to define its selectors.
            self.assertIn('selectors', json.loads(page[1]), site)
            scraped = self.s.scrape(url, html)
            for key, value in scraped.items():
                self.assertFalse(value, key)

    def test_scrape_valid_data_in_le_narguile(self):
        """The le-narguile product page yields the known price, image and name."""
        html = self._read_fixture('le-narguile.html')
        url = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/'
        data = self.s.scrape(url, html)
        img_link = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/350x350/9df78eab33525d08d6e5fb8d27136e95/p/i/picture_2013_1.jpg'
        price = '99,00\xa0€'
        name = 'Narguilé syrien Star argenté de 79 cm'
        bc = None
        self._assert_data(data, price, img_link, name, bc)

    def test_scrape_valid_data_in_royaldeco(self):
        """The royaledeco product page yields the known price, image and name."""
        html = self._read_fixture('royaledeco.html')
        url = 'http://www.royaledeco.com/67686-mainpict/'
        data = self.s.scrape(url, html)
        img_link = 'http://www.royaledeco.com/67686-mainpict/fauteuil-galaxy-blanc.jpg'
        price = '129,00 € TTC'
        name = 'Fauteuil Galaxy blanc'
        bc = None
        self._assert_data(data, price, img_link, name, bc)
class TestDataExtractor(unittest.TestCase):
    """Run ``Scraper.scrape`` against saved product pages and check the output.

    Every test starts from a ``Scraper`` built out of three JSON selector
    fixtures: they are opened with ``get_file_handle``, their text is
    encoded and passed to ``init_data``, and the scrape data it returns is
    handed to ``Scraper``.

    NOTE(review): a class with this same name is also defined earlier in
    this file; this later definition is the one bound to the name at import
    time -- confirm whether the earlier copy should be removed.
    """

    # Selector fixtures as (key in ``self.pages``, file name), in load order.
    _PAGE_FIXTURES = (
        ('lenarguile', 'le-narguile.com.json'),
        ('royaledeco', 'royaledeco.com.json'),
        ('10k00nt', '10k00nt.com.json'),
    )

    def setUp(self):
        """Load the selector fixtures and create ``self.s``."""
        self.pages = {}
        for key, filename in self._PAGE_FIXTURES:
            opened = get_file_handle(filename)
            # addCleanup (rather than tearDown) guarantees the handle is
            # closed even when a later step of setUp raises, because
            # tearDown is not called after a failed setUp.
            self.addCleanup(opened[0].close)
            self.pages[key] = opened

        encoded = [(None, opened[1].encode()) for opened in self.pages.values()]
        _, scrape_data = init_data(encoded)
        self.s = Scraper(scrape_data)

    def _load_html(self, filename):
        """Read fixture *filename* and close its handle before returning."""
        opened = get_file_handle(filename)
        try:
            return opened[1]
        finally:
            opened[0].close()

    def _assert_data(self, data, price, img_link, name, bc):
        """Compare a scrape result with the expected product details.

        Args:
            data: Mapping produced by ``Scraper.scrape``.
            price: Expected ``'prix_css'`` value.
            img_link: Expected ``'image_css'`` value.
            name: Expected ``'nom_css'`` value.
            bc: Expected ``'breadcrumb_css'`` value.

        When a scraped value exposes ``__iter__`` the expected value has to
        be a member of it; otherwise the two have to compare equal.
        """
        # NOTE(review): passes when 'error' is missing or truthy; verify that
        # a truthy 'error' really denotes a successful scrape.
        self.assertTrue(data.get('error', True))

        for field, expected in (('prix_css', price),
                                ('image_css', img_link),
                                ('nom_css', name),
                                ('breadcrumb_css', bc)):
            scraped = data[field]
            # NOTE(review): Python 3 strings expose ``__iter__`` too, so a
            # string result is matched by substring, not equality.
            if hasattr(scraped, '__iter__'):
                self.assertIn(expected, scraped, field)
            else:
                self.assertEqual(expected, scraped, field)

    def test_file_opened(self):
        """A loaded fixture must expose a truthy first element."""
        self.assertTrue(self.pages['lenarguile'][0])

    def test_scrape_no_data_in_html(self):
        """An empty HTML document must produce no truthy scraped value."""
        html = '<html></html>'
        url = 'www.ex.com'
        for key, opened in self.pages.items():
            # Sanity-check the fixture itself before scraping.
            self.assertIn('selectors', json.loads(opened[1]), key)
            result = self.s.scrape(url, html)
            for field, value in result.items():
                self.assertFalse(value, field)

    def test_scrape_valid_data_in_le_narguile(self):
        """Known values are extracted from the le-narguile product page."""
        html = self._load_html('le-narguile.html')
        url = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/'
        data = self.s.scrape(url, html)
        img_link = 'http://www.le-narguile.com/media/catalog/product/cache/6/image/350x350/9df78eab33525d08d6e5fb8d27136e95/p/i/picture_2013_1.jpg'
        price = '99,00\xa0€'
        name = 'Narguilé syrien Star argenté de 79 cm'
        bc = None
        self._assert_data(data, price, img_link, name, bc)

    def test_scrape_valid_data_in_royaldeco(self):
        """Known values are extracted from the royaledeco product page."""
        html = self._load_html('royaledeco.html')
        url = 'http://www.royaledeco.com/67686-mainpict/'
        data = self.s.scrape(url, html)
        img_link = 'http://www.royaledeco.com/67686-mainpict/fauteuil-galaxy-blanc.jpg'
        price = '129,00 € TTC'
        name = 'Fauteuil Galaxy blanc'
        bc = None
        self._assert_data(data, price, img_link, name, bc)