def parseProductTitle(self, item, ad):
    """Copy the product title from ``item`` into ``ad['title']``.

    The title is read from the ``data-attribute`` attribute of the first
    ``h2`` element found under ``item``.

    Args:
        item: Element exposing ``find()``; presumably a BeautifulSoup tag
            for a single search result -- confirm against the caller.
        ad: Mapping describing the product; updated in place.

    Raises:
        utility.CrawlerError: If there is no ``h2`` element, or it has no
            ``data-attribute`` attribute.
    """
    heading = item.find('h2')
    # Both failure modes report the same message, so they share one guard.
    if not heading or heading.get('data-attribute') is None:
        raise utility.CrawlerError('Product title is missing')
    ad['title'] = heading.get('data-attribute')
def parseProductLink(self, item, ad):
    """Record the product's detail URL and reject links already seen.

    The first anchor with classes ``a-link-normal a-text-normal`` supplies
    the link. Its ``href`` is passed through ``utility.normalizeUrl`` and
    stored in ``ad['detail_url']`` before the duplicate check, so the key
    is set even when the duplicate error is raised.

    Args:
        item: Element exposing ``find()``; presumably a BeautifulSoup tag
            for a single search result -- confirm against the caller.
        ad: Mapping describing the product; updated in place.

    Raises:
        utility.CrawlerError: If no matching anchor exists, or the
            normalized URL is already in ``self.visitedUrl``.
    """
    anchor = item.find('a', class_='a-link-normal a-text-normal')
    if not anchor:
        raise utility.CrawlerError('Product format not correct', 0)

    # NOTE(review): an anchor without an href hands None to normalizeUrl;
    # whether that helper tolerates None is not visible here -- confirm.
    url = utility.normalizeUrl(anchor.get('href'))
    ad['detail_url'] = url

    if url not in self.visitedUrl:
        self.visitedUrl.add(url)
        return
    raise utility.CrawlerError('Product link already exists')
def parseProductBrand(self, item, ad):
    """Store the product brand from ``item`` in ``ad['brand']``.

    The brand is the ``.string`` of the second ``span`` carrying the
    classes ``a-size-small a-color-secondary``.

    Args:
        item: Element exposing ``find_all()``; presumably a BeautifulSoup
            tag for a single search result -- confirm against the caller.
        ad: Mapping describing the product; updated in place.

    Raises:
        utility.CrawlerError: If fewer than two matching spans exist.
    """
    spans = item.find_all('span', class_='a-size-small a-color-secondary')
    # The brand is read from index 1, so two matches are required. The old
    # guard (< 1) let a single match through and crashed with IndexError
    # instead of reporting a missing brand.
    if len(spans) < 2:
        raise utility.CrawlerError('Product brand is missing')
    # NOTE(review): `.string` is stored as-is; if item is a BeautifulSoup
    # tag this can be None for spans with nested markup -- confirm callers
    # accept that.
    ad['brand'] = spans[1].string
def parseProductKeywords(self, item, ad):
    """Derive ``ad['keywords']`` from the already-parsed product title.

    The title in ``ad['title']`` is passed to ``utility.cleanedTokenize``
    and the result is stored before it is validated, so the key is set
    even when the error is raised.

    Args:
        item: Unused by this method.
        ad: Mapping describing the product; must already contain
            ``'title'``. Updated in place.

    Raises:
        utility.CrawlerError: If tokenizing the title yields no keywords.
    """
    tokens = utility.cleanedTokenize(ad['title'])
    ad['keywords'] = tokens
    if len(tokens) == 0:
        raise utility.CrawlerError('Lack of keywords')