def __init__(self, lang):
    '''
    Create the downloader and remember the requested language.

    @param lang: value stored unchanged as self.lang
    '''
    self.downloader = Downloader()
    self.lang = lang
def __init__(self, page):
    '''
    Parse the given page and create the downloader.

    @param page: markup handed directly to BeautifulSoup
    '''
    parsed = BeautifulSoup(page)
    self.downloader = Downloader()
    self.soup = parsed
class ZaraScrape(Scraper):
    '''
    Scraper for the Zara website.

    setConfig() must be called before run(): run() reads the section,
    subsection, product type and body part that setConfig() stores.
    '''

    BRAND_NAME = 'Zara'
    PAGE_BASE = 'http://www.zara.com/fr/'

    def __init__(self, lang):
        '''
        @param lang: language code; appended to PAGE_BASE to build the home
                     page url and used as a component of the image paths
        '''
        self.lang = lang
        self.downloader = Downloader()

    def setConfig(self, section, subsection, productType, bodyPart):
        '''
        Store what has to be scraped and how the products are labelled.

        @param section: name of the first level menu entry to open
        @param subsection: name of the second level menu entry to open
        @param productType: passed to every Product that is built
        @param bodyPart: passed to every Product that is built
        '''
        self.section = section
        self.subsection = subsection
        self.type = productType
        self.bodies = bodyPart

    def _openMenuEntry(self, browser, name):
        '''
        Find the menu entry called `name` on the current page and open it.

        @return: True when the page was loaded, False (after logging a
                 warning) when the entry or its page could not be obtained
        '''
        try:
            url = browser.getMenuLinkFromName(name)
        except AttributeError:
            # The lookup dereferences the result of a soup search, so a
            # missing menu or entry shows up as an AttributeError.
            log.warning("Unable to find the menu entry '%s'. Omitting." % name)
            return False

        try:
            browser.goTo(url, 5)
        except Exception:
            # Not a bare except: KeyboardInterrupt / SystemExit must still
            # be able to stop the scraper.
            log.warning("Unable to get the page '%s'. Omitting." % url)
            return False

        return True

    def run(self, usePlainImage = True, download = False):
        '''
        Perform the scraping on the Zara website.

        @param usePlainImage: forwarded to the browser to choose between
                              the 'plain' and the 'full' product image
        @param download: when True, the download folder is created and
                         every product image is written into it
        @return: list of Product objects; an empty list when one of the two
                 menu levels cannot be opened
        '''
        if download:
            # DL_FOLDER_PATH_BASE is not defined in this class; presumably
            # it is inherited from Scraper.
            self.dl_folder = self.DL_FOLDER_PATH_BASE + self.lang + '/' + \
                             self.section + '/' + \
                             self.subsection + '/'

            # Create the folder if it does not exist yet
            try:
                os.makedirs(self.dl_folder)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

        log.info('-- Starting scraping --')

        home = self.downloader.getFile(self.PAGE_BASE + self.lang + '/')
        browser = ZaraBrowser(home)

        # Go through the first, then the second menu level
        for menuName in (self.section, self.subsection):
            if not self._openMenuEntry(browser, menuName):
                return []

        # Start items parsing
        i = 0
        itemList = []
        for item in browser.getProductsList():
            log.debug('zzZZZZzzz')
            time.sleep(3)  # throttle the requests between two products

            # Go to the product page
            try:
                browser.goTo(item['url'])
            except Exception:
                log.warning("FAIL : Unable to download '" + item['name'] + "'. Omitting.")
                continue

            imgUrl = browser.getProductImageLink(usePlainImage)
            if imgUrl is None:
                log.info('FAIL : Unable to get product image for "' + item['name'] + '". Omitting.')
                continue

            color = browser.getProductColor()

            imgFilename = str(i) + '-' + item['name'].replace(' ', '_')
            imgPath = self.BRAND_NAME + '/' + \
                      self.lang + '/' + \
                      self.section + '/' + \
                      self.subsection + '/' + \
                      imgFilename + '.jpg'

            # Build a product object
            product = Product(item['name'], self.BRAND_NAME, color, imgUrl,
                              imgPath, self.type, self.bodies)
            itemList.append(product)
            log.info(product.toString())

            # Download the file if the flag is True
            if download:
                log.info('Downloading ' + imgFilename + '...')
                # NOTE(review): the target has no '.jpg' suffix while imgPath
                # has one -- confirm whether writeFile() adds the extension.
                self.downloader.writeFile(imgUrl, self.dl_folder + imgFilename)

            # Count the products that were actually scraped
            i += 1

        log.info('-- Ending scraping --')
        log.info('-- ' + str(i) + ' images was scraped --')

        return itemList
class ZaraBrowser(Browser):
    '''
    Holds the parsed html of the current Zara page and extracts the menu,
    the product list and the product details from it.
    '''

    def __init__(self, page):
        '''
        @param page: Just a string with html code
        '''
        self.downloader = Downloader()
        self.soup = BeautifulSoup(page)

    def goTo(self, url, timeRetrying = None):
        '''
        Download `url` and make it the current page.

        Any exception raised by the downloader propagates to the caller and
        the current page is left untouched in that case.

        @param url: address of the page to load
        @param timeRetrying: forwarded as is to Downloader.getFile
        '''
        page = self.downloader.getFile(url, timeRetrying)
        self.soup = BeautifulSoup(page)

    # ---- Menu section parsing ----

    def getMenu(self, bSubmenu = False):
        '''
        @param bSubmenu: when True return the 'bSubmenu' list nested in the
                         main navigation menu instead of the menu itself
        @return: the matching element (None when the search finds nothing)
        '''
        menu = self.soup.find(id = 'mainNavigationMenu')
        if bSubmenu:
            menu = menu.find('ul', attrs = {'class': 'bSubmenu'})
        return menu

    def getMenuEntries(self, bSubmenu = False):
        '''
        @return: every link found in the menu selected by `bSubmenu`
        '''
        return self.getMenu(bSubmenu).find_all('a')

    def getMenuLinkFromName(self, name):
        '''
        Find the menu link whose text is whitespace followed by `name`
        (case insensitive).

        @param name: literal menu label; it is escaped, so characters such
                     as '(' or '+' are matched as plain text
        @return: the href of the link
        @raise AttributeError: when the menu or the entry is not found
        '''
        pattern = re.compile(r'\s+' + re.escape(name), re.I)
        return self.getMenu().find('a', text = pattern).get('href')

    # ---- Products section parsing ----

    def getProductsList(self):
        '''
        @return: one {'name': lower-cased link text, 'url': href} dict per
                 'product-info' block of the 'product-list' element
        '''
        product_list = self.soup.find(id = 'product-list')
        infos = product_list.find_all('div', attrs = {'class': 'product-info'})
        links = [info.find('a') for info in infos]
        return [{'name': link.get_text().lower(), 'url': link.get('href')}
                for link in links]

    # ---- Product page parsing ----

    def getProductImageLink(self, usePlainImage):
        '''
        @param usePlainImage: True for the 'plain' image, False for 'full'
        @return: the image url or None
        '''
        if usePlainImage:
            return self.getProductPlainImageLink()
        return self.getProductFullImageLink()

    def _getBigImageLink(self, kind):
        '''
        Shared lookup for the 'plain' and 'full' product images.

        @param kind: class of the div wrapping the wanted image
        @return: absolute image url, or None (after a warning) when the
                 image or its src attribute is missing
        '''
        container = self.soup.find('div', attrs = {'class': 'bigImageContainer'})
        try:
            imageSrc = container.find('div', attrs = {'class': kind}) \
                                .find('img', attrs = {'class': 'image-big'}) \
                                .get('src')
        except AttributeError:
            # One of the searches returned None: the image is not there
            imageSrc = None

        if not imageSrc:
            log.warning('No "' + kind + '" image found for this product.')
            return None

        # Leave urls that already carry a scheme alone (https included);
        # anything else gets the 'http:' prefix, e.g. '//host/img.jpg'.
        if re.match(r'^https?://', imageSrc, re.I):
            return imageSrc
        return 'http:' + imageSrc

    def getProductPlainImageLink(self):
        '''
        @warning: May not have a return value
        @return: 'plain' image or None
        '''
        return self._getBigImageLink('plain')

    def getProductFullImageLink(self):
        '''
        @warning: May not have a return value
        @return: 'full' image or None
        '''
        return self._getBigImageLink('full')

    def getProductColor(self):
        '''
        Read the selected colour from the 'itemAdd' form.

        @return: {'name': text of the label's span,
                  'value': the label's data-colorcode attribute}
        @raise AttributeError: when the form, the colours block or the
                               selected label is missing from the page
        '''
        container = self.soup.find('form', attrs = {'name': 'itemAdd'}) \
                             .find('div', attrs = {'class': 'colors'}) \
                             .find('label', attrs = {'class': 'selected'})
        return {'name': container.find('span').get_text(),
                'value': container.get('data-colorcode')}