def parse(self, response):
    """Crawl the category tree and yield a CategoryItem per leaf.

    Opens the page in Selenium, walks every category group, builds a
    breadcrumb-style path and hands each leaf URL to parse_categories.

    Fix: the bare ``except:`` was narrowed to ``except Exception:`` so
    KeyboardInterrupt/SystemExit are no longer swallowed.
    """
    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(response.url)
        category_path = selector.xpath(
            '//ul[contains(@class, "grouplist__group")]')
        for categories in category_path:
            path = self.extract(
                categories.xpath(
                    './li[@class="grouplist__group__title"]/text()'))
            leaves = categories.xpath(
                './li[@class="grouplist__group__item"]')
            for leaf in leaves:
                category = CategoryItem()
                category['category_leaf'] = self.extract(
                    leaf.xpath('.//text()'))
                category['category_path'] = 'Home > ' + path + ' > ' + \
                    category['category_leaf']
                category['category_url'] = get_full_url(
                    response.url, self.extract(leaf.xpath('./a/@href')))
                yield category
                try:
                    for item in self.parse_categories(
                            browser, category['category_url'], category):
                        yield item
                except Exception:
                    # Best-effort per leaf: one broken category page must
                    # not abort the whole crawl.
                    pass
def level_4(self, response):
    """Click 'load more results' until exhausted, then request each product.

    Fixes: the bare ``except:`` around the pager click was narrowed to
    ``except Exception:``; the template's dead ``if "":`` regex branch
    (never taken, since the pattern was empty) was removed — behavior is
    unchanged.
    """
    original_url = response.url
    with SeleniumBrowser(self, response) as browser:
        wait_for = None
        wait_for_xpath = ""  # template hook: no extra wait configured
        if wait_for_xpath:
            wait_for = EC.wait_none((By.XPATH, wait_for_xpath))
        browser.get(response.url)
        while True:
            try:
                selector = browser.click_link(
                    "//button[@id='load-more-results']", wait_for)
                response = selector.response
            except Exception:
                # No (or no more) "load more" button: stop paging.
                break
        urls_xpath = "//div[@class='product-info']/a/@href"
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_5)
            yield request
def parse(self, response):
    """Yield category, product and review items for one product page.

    Category/product fields are bootstrapped from XPaths via
    init_item_by_xpaths; the reviews live behind a tab that only renders
    after a Selenium click on ``tabRating``.
    """
    category_xpaths = {
        "category_leaf": "(//ul[@class='ulNavigationBreadcrumb']/li/a)[last()]/text()",
        "category_path": "(//ul[@class='ulNavigationBreadcrumb']/li/a)[last()]/text()"
    }
    product_xpaths = {
        "PicURL": "(//ul[@class='thumbsBox']/li/a)[1]/@href",
        "ProductName": "//h1[@itemprop='name']/text()"
    }
    category = self.init_item_by_xpaths(response, "category", category_xpaths)
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product["OriginalCategoryName"] = category["category_path"]
    # Picture href is relative on this site; absolutize it.
    product["PicURL"] = get_full_url(response.url, product["PicURL"])
    yield category
    yield product
    yield self.get_rm_kidval(product, response)
    reviews_xpath = "//a[@id='tabRating']"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        browser.scroll(200)  # click auto scroll does not work for some
        selector = browser.click(reviews_xpath)
        for review in self._parse_reviews(selector, browser, product):
            yield review
def parse(self, response):
    """Build a ProductItem with BeautifulSoup, then expand and parse the
    review section via Selenium.
    """
    all_review_button_xpath = "//a[contains(@class,'seeAllReviews')]"
    soup = BeautifulSoup(response.body, "lxml")
    #inspect_response(response, self)
    # The internal product id is the second-to-last URL path segment.
    item_id = response.url.split('/')[-2].strip()
    product = ProductItem()
    product['source_internal_id'] = item_id
    product['ProductName'] = soup.find('span', {
        'itemprop': 'name'
    }).text.strip()
    product['ProductManufacturer'] = soup.find('span', {
        'itemprop': 'manufacturer'
    }).text.strip()
    # Join every breadcrumb entry into "A > B > C".
    ocn = []
    ocn_paths = soup.find('ul', {
        'class': 'Breadcrumb-list'
    }).find_all('span', {'itemprop': 'title'})
    for item in ocn_paths:
        ocn.append(item.text.strip())
    product['OriginalCategoryName'] = ' > '.join(ocn)
    product['PicURL'] = soup.find(
        'img', {'class': 'js-ProductVisuals-imagePreview'})['src'].strip()
    product['TestUrl'] = response.url
    yield product
    yield self.get_rm_kidval(product, response)
    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(response.url)
        # NOTE(review): the button's presence is checked on the original
        # (non-Selenium) response but clicked in the browser — confirm the
        # static and rendered DOM agree here.
        all_review_button = response.xpath(all_review_button_xpath)
        if all_review_button:
            selector = browser.click(
                "//a[contains(@class,'seeAllReviews')]")
        for review in self._parse_reviews(selector, product, browser):
            yield review
def parse_product(self, response):
    """Emit a ProductItem (plus an MPN ProductIdItem when available) for a
    product page, then load the review page in Selenium and parse it.
    """
    category = response.meta['category']
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    product["ProductManufacturer"] = self.extract(response.xpath('//a[@class="brand"]/text()'))
    product['PicURL'] = self.extract(response.xpath('//meta[@property="og:image"]/@content'))
    product['source_internal_id'] = self.extract(response.xpath('//div[@id="pdpFRdivMain"]/@data-productid'))
    # MPN comes from the detail list, skipping "Taille" (size) rows.
    mpn = self.extract(response.xpath(
        '//dt[@data-cerberus="txt_pdp_sizetitle"]/parent::dl/dd[not(contains(text(),"Taille"))]/text()'))
    if mpn:
        product['ProductName'] = product["ProductManufacturer"] + ' ' + mpn
        product_id = self.product_id(product)
        product_id['ID_kind'] = "MPN"
        product_id['ID_value'] = mpn
        yield product_id
    else:
        # No MPN: fall back to the page's displayed product name.
        name = self.extract(response.xpath('//h2[@itemprop="name"]/text()'))
        product['ProductName'] = product["ProductManufacturer"] + ' ' + name
    yield product
    review_url = self.extract(response.xpath('//a[@class="read-reviews"]/@href'))
    review_url = get_full_url(response, review_url)
    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(review_url, timeout=10)
        # Hand the live browser to parse_reviews via meta so it can
        # paginate without re-opening Selenium.
        response.meta['browser'] = browser
        response.meta['product'] = product
        response.meta['_been_in_decorator'] = True
        for review in self.parse_reviews(response, selector, incremental=True):
            yield review
def parse(self, response):
    """Walk the review listing inside an iframe and schedule parse_review
    requests; only follow the next page while unseen products keep
    appearing (incremental crawl).
    """
    iframe_xpath = "//iframe[@id='mainframe']"
    review_url_xpath = "//div[@class='title']/a/@href"
    # Captures the review id from .../magazine/Y/M/D/<id>/ URLs.
    review_re = 'magazine/\d+/\d+/\d+/(\d+)/'
    continue_next_page = False
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.switch_to_frame(iframe_xpath)
        next_page_xpath = "//a[@class='next_page']/@href"
        review_urls = self.extract_list(selector.xpath(review_url_xpath))
        for review_url in review_urls:
            match = re.search(review_re, review_url)
            if not match:
                print review_url
                continue
            source_internal_id = match.group(1)
            # NOTE(review): paging continues when the product is NOT yet in
            # the DB — presumably to keep discovering new products; confirm
            # that this (and not the inverse) is the intended condition.
            if not is_product_in_db_by_sii(self.mysql_manager,
                                           self.spider_conf["source_id"],
                                           source_internal_id):
                continue_next_page = True
            review_url = get_full_url(response.url, review_url)
            request = Request(review_url, callback=self.parse_review)
            request.meta['source_internal_id'] = source_internal_id
            yield request
        if continue_next_page:
            next_page = self.extract(selector.xpath(next_page_xpath))
            next_page = get_full_url(response.url, next_page)
            if next_page:
                request = Request(next_page, callback=self.parse)
                yield request
def parse(self, response):
    """Scroll the listing to the end and request every article found.

    Bug fix: the generated regex filter was configured with an EMPTY
    pattern, so ``if "":`` never matched and the ``else: continue``
    skipped every URL — this callback yielded nothing.  An empty pattern
    now means "no filtering"; a non-empty one keeps the original
    match-or-skip semantics.
    """
    original_url = response.url
    with SeleniumBrowser(self, response) as browser:
        wait_for = None
        wait_for_xpath = ""  # template hook: no extra wait configured
        if wait_for_xpath:
            wait_for = EC.wait_none((By.XPATH, wait_for_xpath))
        browser.get(response.url)
        selector = browser.scroll_until_the_end(2000, wait_for)
        response = selector.response
        urls_xpath = "//h3[contains(@class,'entry-title')]/a/@href"
        url_filter_regex = ""  # template hook: no URL filter configured
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            if url_filter_regex:
                matches = re.search(url_filter_regex, single_url,
                                    re.IGNORECASE)
                if not matches:
                    continue  # filtered out by the configured pattern
                single_url = matches.group(0)
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_2)
            yield request
def parse(self, response):
    """Click 'more articles' until the link disappears, then request every
    article URL from the fully-expanded stream.

    Bug fixes: (1) the template's EMPTY regex pattern made the old loop
    ``continue`` on every URL and yield nothing — an empty pattern now
    means "no filtering"; (2) the bare ``except:`` was narrowed to
    ``except Exception:``.
    """
    original_url = response.url
    with SeleniumBrowser(self, response) as browser:
        wait_for = None
        wait_for_xpath = ""  # template hook: no extra wait configured
        if wait_for_xpath:
            wait_for = EC.wait_none((By.XPATH, wait_for_xpath))
        browser.get(response.url)
        while True:
            try:
                selector = browser.click_link(
                    "//a[@class='more-articles']", wait_for)
                response = selector.response
            except Exception:
                # No (or no more) "more articles" link: stop expanding.
                break
        urls_xpath = "//div[@class='article-stream-container']//a[img[@class=' article_wrap']]/@href"
        url_filter_regex = ""  # template hook: no URL filter configured
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            if url_filter_regex:
                matches = re.search(url_filter_regex, single_url,
                                    re.IGNORECASE)
                if not matches:
                    continue  # filtered out by the configured pattern
                single_url = matches.group(0)
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_2)
            yield request
def parse_sub_category(self, response):
    """Open each third-level category URL in Selenium and delegate the
    listing to parse_category.
    """
    level3_links_xpath = '//li[@class="level3"]/a/@href'
    for relative_url in self.extract_list(response.xpath(level3_links_xpath)):
        absolute_url = get_full_url(response, relative_url)
        with SeleniumBrowser(self, response) as browser:
            # The appended suffix pre-selects the review widgets.
            selector = browser.get(absolute_url + '||reviews*299|reviews*298')
            for item in self.parse_category(browser, selector):
                yield item
def parse(self, response):
    """Collect product links rendered client-side (ng-href) and schedule
    parse_product for each.
    """
    product_link_xpath = '//div[@class="media-body"]/a/@ng-href'
    with SeleniumBrowser(self, response) as browser:
        page = browser.get(response.url)
        for product_url in self.extract_list(page.xpath(product_link_xpath)):
            yield Request(url=product_url, callback=self.parse_product)
def parse(self, response):
    """Dispatch to _parse, reusing a browser passed via meta when present,
    otherwise opening (and closing) a fresh Selenium session.
    """
    if 'browser' not in response.meta:
        with SeleniumBrowser(self, response) as browser:
            for request in self._parse(response, browser):
                yield request
    else:
        for request in self._parse(response, response.meta['browser']):
            yield request
def parse(self, response):
    """Scroll the blog feed, emit a product+review pair per BlogPosting
    article, and click 'load more posts' until no new posts appear.
    """
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        keep_going = True
        while keep_going:
            selector = browser.scroll_until_the_end(5000)
            for review_text in selector.xpath('//article[@itemtype="http://schema.org/BlogPosting"]').extract():
                # Re-wrap each article's HTML so the // XPaths below are
                # scoped to this article only.
                review_section = Selector(text=review_text)
                product = ProductItem()
                review = ReviewItem()
                product['OriginalCategoryName'] = "Miscellaneous"
                review['DBaseCategoryName'] = "PRO"
                review['TestTitle'] = self.extract(review_section.xpath('//h2[@itemprop="headline"]/a/text()'))
                review['TestUrl'] = self.extract(review_section.xpath('//h2[@itemprop="headline"]/a/@href'))
                product['TestUrl'] = review['TestUrl']
                review['Author'] = self.extract(review_section.xpath('//span[@itemprop="author"]/a/text()'))
                if review['TestTitle']:
                    # "<name> review ..." titles: keep just the product name.
                    matches = re.search("^(.*?) review", review['TestTitle'], re.IGNORECASE)
                    if matches:
                        review['ProductName'] = matches.group(1)
                        product['ProductName'] = matches.group(1)
                    else:
                        review['ProductName'] = review['TestTitle']
                        product['ProductName'] = review['TestTitle']
                review["TestDateText"] = self.extract(review_section.xpath('//time/@datetime'))
                review['TestSummary'] = self.extract_all(review_section.xpath('//div[@class="block-excerpt"]/div[@itemprop="articleBody"]/*/text()'), separator=" ")
                product['PicURL'] = self.extract(review_section.xpath('//div[@class="block-image"]/a/img/@src'))
                yield product
                yield review
            if self.extract(selector.xpath('//div[@id="load-more-posts"]')):
                #if self.extract(selector.xpath('//div[@id="load-more-posts"]/div')):
                #    print "Current URL: ", self.browser.browser.current_url
                #    self.browser.browser.refresh()
                #else:
                # NOTE(review): both headline reads below query the SAME
                # selector object captured before the click, so pre/post are
                # presumably always equal and the loop stops after one extra
                # click — confirm whether `selector` should be refreshed
                # after browser.click().
                pre_click_headline = self.extract(selector.xpath('//article[@itemtype="http://schema.org/BlogPosting" and position()=2]//h2[@itemprop="headline"]/a/text()'))
                browser.click('//div[@id="load-more-posts"]')
                post_click_headline = self.extract(selector.xpath('//article[@itemtype="http://schema.org/BlogPosting" and position()=2]//h2[@itemprop="headline"]/a/text()'))
                if pre_click_headline == post_click_headline:
                    keep_going = False
            else:
                keep_going = False
def parse(self, response):
    """Emit category, product, product-id and review items for a product
    page; the review tab only renders after a Selenium click.

    Fix: removed a leftover debug ``print category`` statement.
    """
    #Must use only product_page
    category_xpaths = {
        "category_leaf": "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a[last()-1]/text()"
    }
    category_path_xpath = "//div[@id='breadcrumb']/a[@class='home']/following-sibling::a/text()"
    product_xpaths = {
        "PicURL": "(//*[@property='og:image'])[1]/@content",
        "source_internal_id": "//form[@id='productSheet']/@data-product",
        "ProductName": "//div[@itemprop='name']/h1/text()",
        "ProductManufacturer": "//*[@class='nameBrand']/text()"
    }
    # Drop the last breadcrumb entry (the product itself) from the path.
    category_path_selector = response.xpath(category_path_xpath)
    category_path_selector = category_path_selector[:-1]
    category = self.init_item_by_xpaths(response, "category", category_xpaths)
    category["category_path"] = self.extract_all(category_path_selector,
                                                 separator=' | ')
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product["OriginalCategoryName"] = category["category_path"]
    product_id = ProductIdItem()
    product_id['source_internal_id'] = product["source_internal_id"]
    product_id['ProductName'] = product["ProductName"]
    product_id['ID_kind'] = "conforama_fr_id"
    product_id['ID_value'] = product["source_internal_id"]
    yield product_id
    yield category
    yield product
    yield self.get_rm_kidval(product, response)
    reviews_xpath = "//a[@id='rating']"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.click(reviews_xpath)
        # Hand browser/context to parse_reviews via meta for pagination.
        response.meta['browser'] = browser
        response.meta['product'] = product
        response.meta['product_id'] = product_id
        response.meta['_been_in_decorator'] = True
        for review in self.parse_reviews(response, selector=selector):
            yield review
def parse(self, response):
    """Log in once through Selenium (unless cookies were passed in via
    meta) and schedule every report category with the session cookies.
    """
    cookies = response.meta.get('cookies', None)
    if not cookies:
        # First pass: images/CSS stay enabled so the login flow renders.
        with SeleniumBrowser(self, response, no_images=False,
                             no_css=False) as browser:
            cookies = self.login_selenium(browser)
    cat_url_xpath = "//footer[@class='report-category__footer']/a/@href"
    for cat_url in self.extract_list_xpath(response, cat_url_xpath):
        request = Request(get_full_url(response, cat_url),
                          callback=self.parse_category_leafs)
        request.meta['cookies'] = cookies
        yield request
def parse(self, response):
    """Yield category/product items for a product page, plus MPN/EAN id
    items when present, then parse reviews through Selenium.
    """
    #Must use only product_page
    category_xpaths = {
        "category_leaf": "//*[@id='moreFrom-catLink']/a/text()",
        "category_path": "//*[@id='moreFrom-catLink']/a/text()"
    }
    product_xpaths = {
        "PicURL": "(//li[@class='productImageItem'])[1]//img/@src",
        "ProductName": "//h1[@class='productHeading']//text()",
        "ProductManufacturer": "//h1[@class='productHeading']/text()"
    }
    category = self.init_item_by_xpaths(response, "category", category_xpaths)
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    sii_match = re.search(self.source_internal_id_re, response.url)
    if sii_match:
        product['source_internal_id'] = sii_match.group(1)
    product["OriginalCategoryName"] = category["category_path"]
    yield category
    yield product
    yield self.get_rm_kidval(product, response)
    # One ProductIdItem per identifier kind found on the page (MPN first,
    # then EAN — same order as before).
    for id_kind, id_xpath in (
            ("MPN", "//span[@id='productMPN']/text()"),
            ("EAN", "//span[@id='productEAN']/text()")):
        id_value = self.extract(response.xpath(id_xpath))
        if id_value:
            id_item = ProductIdItem()
            id_item['source_internal_id'] = product["source_internal_id"]
            id_item['ProductName'] = product["ProductName"]
            id_item['ID_kind'] = id_kind
            id_item['ID_value'] = id_value
            yield id_item
    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(response.url)
        for review in self._parse_reviews(selector, browser, product):
            yield review
def parse(self, response):
    """Keep clicking the 'more' button until stop_scraping says enough,
    scheduling a parse_review request for every news item revealed.
    """
    selector = Selector(response)
    next_page_xpath = "//*[@class='more-button']"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        # NOTE(review): the first stop_scraping check runs on the original
        # (pre-click) response; URLs are only harvested after each click.
        while not self.stop_scraping(selector):
            selector = browser.click(next_page_xpath)
            review_urls = self.extract_list(
                selector.xpath('//h3[@class="news-item-title"]/a/@href'))
            for review_url in review_urls:
                review_url = get_full_url(response, review_url)
                request = Request(review_url, callback=self.parse_review)
                yield request
def parse_product(self, response):
    """Build the ProductItem (title stripped at the first parenthesis),
    emit MPN and cdw id items, then parse reviews through Selenium.
    """
    category = response.meta['category']
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = category['category_path']
    name = self.extract(response.xpath('//h1/span/text()'))
    # Split on parentheses; the first chunk is the bare product name.
    name_match = re.findall(r'[^()]+', name)
    product['ProductName'] = name_match[0]
    pic_url = self.extract(
        response.xpath('//div[@class="main-image"]/img/@src'))
    if pic_url:
        product['PicURL'] = get_full_url(response, pic_url)
    product['ProductManufacturer'] = self.extract(
        response.xpath('//span[@class="brand"]/text()'))
    product['source_internal_id'] = self.extract(
        response.xpath(
            '//body[@id="MasterPageBodyTag"]/@data-productcode'))
    yield product
    mpn = self.extract(
        response.xpath('//span[contains(text(),"Mfg")]/span/text()'))
    if mpn:
        mpn_id = self.product_id(product)
        mpn_id['ID_kind'] = "MPN"
        mpn_id['ID_value'] = mpn
        yield mpn_id
    product_id = self.product_id(product)
    product_id['ID_kind'] = "cdw_id"
    product_id['ID_value'] = product['source_internal_id']
    yield product_id
    with SeleniumBrowser(self, response) as browser:
        selector = browser.get(response.url, timeout=10)
        # Hand browser/context to parse_reviews via meta for pagination.
        response.meta['browser'] = browser
        response.meta['product'] = product
        response.meta['product_id'] = product_id
        response.meta['_been_in_decorator'] = True
        for review in self.parse_reviews(response, selector, incremental=True):
            yield review
def parse(self, response):
    """Emit a fixed 'Cell Phones' category, then schedule a Selenium
    request per terminal product (prepaid 'movil/tarjeta' links excluded).
    """
    product_page_xpath = (
        "//li[contains(@class, 'parentTerminal')]"
        "//a[contains(@class, 'btn') and "
        "not(contains(@href, 'movil/tarjeta'))]/@href")
    category = CategoryItem()
    category['category_path'] = "Cell Phones"
    yield category
    # Images/CSS stay enabled: the page needs them to render the buttons.
    with SeleniumBrowser(self, response, no_images=False,
                         no_css=False) as browser:
        selector = browser.get(response.url)
        for product_url in self.extract_list_xpath(selector,
                                                   product_page_xpath):
            request = self.selenium_request(
                get_full_url(response.url, product_url),
                callback=self.parse_product)
            request.meta['category'] = category
            yield request
def parse(self, response):
    """Build category and product items from the breadcrumb/product page,
    extract the internal id, then open the customer-review section in
    Selenium and parse the reviews.
    """
    #Must use only product_page
    category_xpaths = {
        "category_leaf": "(//div[@class='breadcrumb']//a/span)[last()]//text()"
    }
    category_path_xpath = "(//div[@class='breadcrumb']//a/span)//text()"
    product_xpaths = {
        "PicURL": "(//*[@property='og:image'])[1]/@content",
        "ProductName": "//h1[contains(@class, 'page-title')]/span//text()",
        "ProductManufacturer": "//h1[contains(@class,'page-title')]/span[@itemprop='brand']/text()"
    }
    category = self.init_item_by_xpaths(response, "category", category_xpaths)
    category["category_path"] = self.extract_all(
        response.xpath(category_path_xpath), separator=' | ')
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product["OriginalCategoryName"] = category["category_path"]
    product["source_internal_id"] = None
    raw_identifier = self.extract(
        response.xpath("//meta[@itemprop='identifier']/@content"))
    sii_match = re.match(self.source_internal_id_re, raw_identifier)
    if sii_match:
        product["source_internal_id"] = sii_match.group(1)
    yield category
    yield product
    yield self.get_rm_kidval(product, response)
    reviews_xpath = "//a[contains(text(),' customer reviews')]"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.click(reviews_xpath)
        for review in self._parse_reviews(selector, browser, product):
            yield review
def parse_product(self, response):
    """Iterate every colour-variant URL and, within each, click through
    every size option, delegating each combination to _parse_product.
    """
    color_url_xpath = "//fieldset[contains(@class, 'color-picker')]/ul/li/a/@href"
    size_id_xpath = "//fieldset[contains(@class, 'size-picker')]//input/@id"
    # %s is filled with the size input's id; the click targets its label.
    single_size_xpath = "//fieldset[contains(@class, 'size-picker')]//input[@id='%s']/following-sibling::label[1]"
    with SeleniumBrowser(self, response, no_images=False, no_css=False) as browser:
        selector = browser.get(response.url)
        color_urls = self.extract_list_xpath(selector, color_url_xpath)
        for color_url in color_urls:
            color_url = get_full_url(response.url, color_url)
            selector = browser.get(color_url)
            size_ids = self.extract_list_xpath(selector, size_id_xpath)
            for size_id in size_ids:
                selector = browser.click(single_size_xpath % size_id)
                for item in self._parse_product(response, browser, selector):
                    yield item
def parse_category(self, response):
    """If the page exposes an 'all products' intro link, follow it
    (re-entering this callback); otherwise log in via Selenium and parse
    the product listing.
    """
    all_products_xpath = "//a[@data-selector='INTRO_Link']/@href"
    all_products_url = self.extract(response.xpath(all_products_xpath))
    all_products_url = get_full_url(response.url, all_products_url)
    if all_products_url:
        request = self.selenium_request(all_products_url,
                                        callback=self.parse_category)
        request.meta['category'] = response.meta["category"]
        yield request
        return
    with SeleniumBrowser(self, response, no_css=False, no_images=False) as browser:
        self.login_selenium(browser)
        # NOTE(review): xpath below is only referenced by the commented-out
        # guard; currently unused.
        products_div_xpath = "//section[contains(@class,'js-listing--desktop')]"
        #products_div = response.xpath(products_div_xpath)
        #if products_div:
        for item in self.parse_product_list_div(response, browser):
            yield item
def parse_product(self, response):
    """Fan out over product variants.

    Variants can come from a <select> control and/or from colour links;
    Selenium is only started when at least one variant axis exists,
    otherwise the static response is parsed directly.
    """
    select_xpath = "//*[contains(@class, 'product-stage')]" \
        "//select[contains(@class, 'select_to')]"
    select_values_xpath = "//*[contains(@class, 'product-stage')]" \
        "//select[contains(@class, 'select_to')]/option/@value"
    color_variant_url_xpath = "//li[contains(@class, 'product-stage__colors__color')]/a/@href"
    color_variants = self.extract_list_xpath(response, color_variant_url_xpath)
    select = response.xpath(select_xpath)
    if select or color_variants:
        with SeleniumBrowser(self, response, no_images=False, no_css=False) as browser:
            browser.get(response.url)
            if select:
                select_values = self.extract_list_xpath(
                    response, select_values_xpath)
                for value in select_values:
                    if not value:
                        # NOTE(review): an empty option value aborts the
                        # WHOLE generator, not just this option — confirm
                        # this is intended.
                        return
                    selector = browser.select_by_value(select_xpath, value)
                    # Colour links may change per selected option.
                    color_variants = self.extract_list_xpath(
                        selector, color_variant_url_xpath)
                    if color_variants:
                        for item in self.parse_color_variants(
                                response, color_variants, browser=browser):
                            yield item
                    else:
                        for item in self._parse_product(
                                response, selector):
                            yield item
            elif color_variants:
                for item in self.parse_color_variants(
                        response, color_variants, browser):
                    yield item
    else:
        # No variants at all: parse the plain response.
        for item in self._parse_product(response):
            yield item
def parse(self, response):
    """Scroll the landing feed to the end and schedule level_2 requests
    for every special-content URL, forwarding product/review context.

    Bug fixes: (1) the template's EMPTY regex pattern made the old loop
    ``continue`` on every URL and yield nothing — an empty pattern now
    means "no filtering"; (2) removed the no-op try/except around plain
    dict assignments to request.meta.
    """
    original_url = response.url
    product = response.meta.get("product", {})
    review = response.meta.get("review", {})
    with SeleniumBrowser(self, response) as browser:
        wait_for = None
        wait_for_xpath = ""  # template hook: no extra wait configured
        if wait_for_xpath:
            wait_for = EC.wait_none((By.XPATH, wait_for_xpath))
        browser.get(response.url)
        selector = browser.scroll_until_the_end(2000, wait_for)
        response = selector.response
        urls_xpath = "//div[@class='landing-feed--special-content']/a/@href"
        url_filter_regex = ""  # template hook: no URL filter configured
        urls = self.extract_list(response.xpath(urls_xpath))
        for single_url in urls:
            if url_filter_regex:
                matches = re.search(url_filter_regex, single_url,
                                    re.IGNORECASE)
                if not matches:
                    continue  # filtered out by the configured pattern
                single_url = matches.group(0)
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_2)
            request.meta["product"] = product
            request.meta["review"] = review
            yield request
def parse(self, response):
    """Emit product and product-id items for a product page, then click
    through to the reviews in Selenium and parse them.
    """
    #Must use only product_page
    product_xpaths = {
        "PicURL": "(//*[@property='og:image'])[1]/@content",
        "ProductName": "//h1/span[@itemprop='name']/text()",
        "ProductManufacturer": "(//h1/span[@itemprop='name']/text())[1]"
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    # The internal product id is embedded in the URL path.
    sii_match = re.search(self.source_internal_id_re,
                          urlparse(response.url).path)
    if sii_match:
        product["source_internal_id"] = sii_match.group(1)
    product_id = ProductIdItem()
    product_id['source_internal_id'] = product["source_internal_id"]
    product_id['ProductName'] = product["ProductName"]
    product_id['ID_kind'] = "richersounds_id"
    product_id['ID_value'] = product["source_internal_id"]
    yield product_id
    yield product
    yield self.get_rm_kidval(product, response)
    reviews_xpath = "//h4/a[contains(@href,'review')]"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.click(reviews_xpath)
        # Hand browser/context to parse_reviews via meta for pagination.
        response.meta['browser'] = browser
        response.meta['product'] = product
        response.meta['product_id'] = product_id
        response.meta['_been_in_decorator'] = True
        for review in self.parse_reviews(response, selector=selector):
            yield review
def parse(self, response):
    """Yield category and product items built from breadcrumb and product
    markup, then open the review link in Selenium and parse reviews.
    """
    #Must use only product_page
    category_xpaths = {
        "category_leaf": "(//ul[@id='breadcrumb']/li/a)[last()]/text()"
    }
    category_path_xpath = "//ul[@id='breadcrumb']/li/a/text()"
    product_xpaths = {
        "PicURL": "//div[@id='productImage']/img/@src",
        "ProductName": "//div[@typeof='v:Product']/h1/text()",
        "ProductManufacturer": "//div[@typeof='v:Product']/h1/span[@property='v:brand']/text()"
    }
    category_item = self.init_item_by_xpaths(response, "category",
                                             category_xpaths)
    category_item["category_path"] = self.extract_all(
        response.xpath(category_path_xpath), separator=" | ")
    product_item = self.init_item_by_xpaths(response, "product",
                                            product_xpaths)
    product_item["PicURL"] = get_full_url(response, product_item["PicURL"])
    product_item["OriginalCategoryName"] = category_item["category_path"]
    # Brand-prefixed product name.
    product_item["ProductName"] = "%s %s" % (
        product_item['ProductManufacturer'], product_item["ProductName"])
    yield category_item
    yield product_item
    yield self.get_rm_kidval(product_item, response)
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.click("//a[@class='reviewLinks']")
        for review in self._parse_reviews(selector, browser, product_item):
            yield review
def parse(self, response):
    """Yield category/product items for a handset page, then open the
    eKomi review widget plus its 'load all' button and parse reviews.
    """
    category_xpaths = {
        "category_leaf": "(//a[@class='klickpfad'])[last()]//text()"
    }
    category_path_xpath = "//a[@class='klickpfad']/text()"
    product_xpaths = {
        "PicURL": "//div[@id='big_handy_img']/img/@src",
        "ProductName": "//h1/span[@itemprop='name']/text()",
        "ProductManufacturer": "//h1/span[@itemprop='brand']//text()"
    }
    category_item = self.init_item_by_xpaths(response, "category",
                                             category_xpaths)
    category_item["category_path"] = self.extract_all(
        response.xpath(category_path_xpath), separator=' | ')
    product_item = self.init_item_by_xpaths(response, "product",
                                            product_xpaths)
    product_item["OriginalCategoryName"] = category_item["category_path"]
    # Brand-prefixed product name.
    product_item["ProductName"] = "%s %s" % (
        product_item["ProductManufacturer"], product_item["ProductName"])
    yield category_item
    yield product_item
    yield self.get_rm_kidval(product_item, response)
    ekomi_button_xpath = "//a[@id='ekomi_button']"
    load_all_reviews_xpath = "//span[@id='lade_bewertungen']"
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        browser.click(ekomi_button_xpath)
        selector = browser.click(load_all_reviews_xpath)
        for review in self._parse_reviews(selector, browser, product_item):
            yield review
def parse(self, response):
    """Page through the review listing via the 'next' pager link,
    scheduling a level_2 request for every review URL on each page.

    Bug fixes: (1) the template's EMPTY regex pattern made the old loop
    ``continue`` on every URL and yield nothing — an empty pattern now
    means "no filtering"; (2) the bare ``except:`` was narrowed to
    ``except Exception:``.
    """
    original_url = response.url
    with SeleniumBrowser(self, response) as browser:
        wait_for = None
        wait_for_xpath = ""  # template hook: no extra wait configured
        if wait_for_xpath:
            wait_for = EC.wait_none((By.XPATH, wait_for_xpath))
        browser.get(response.url)
        while True:
            try:
                urls_xpath = "//ul[@class='review-listing']//h3/a/@href"
                url_filter_regex = ""  # template hook: no URL filter
                urls = self.extract_list(response.xpath(urls_xpath))
                for single_url in urls:
                    if url_filter_regex:
                        matches = re.search(url_filter_regex, single_url,
                                            re.IGNORECASE)
                        if not matches:
                            continue  # filtered out by the pattern
                        single_url = matches.group(0)
                    single_url = get_full_url(original_url, single_url)
                    request = Request(single_url, callback=self.level_2)
                    yield request
                selector = browser.click_link(
                    "//ul[contains(@class,'pager')]//a[contains(text(),'next')]",
                    wait_for)
                response = selector.response
            except Exception:
                # No next-page link left (or click failed): stop paging.
                break
def level_5(self, response):
    """Parse user reviews from an itemprop='review' listing, clicking the
    'loadmore' button between passes until it stops working.
    """
    original_url = response.url
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        first_time = True
        while True:
            if not first_time:
                try:
                    selector = browser.click_link("//button[contains(@class, 'loadmore')]", None)
                    response = selector.response
                except:
                    # Button gone or click failed: no more pages.
                    break
            first_time = False
            containers_xpath = ".//div[@itemprop='review']"
            containers = response.xpath(containers_xpath)
            for review_container in containers:
                review = ReviewItem()
                review['ProductName'] = self.extract(review_container.xpath("//a[contains(@class, 'breadcrumb-link')]/@title"))
                review['SourceTestRating'] = self.extract(review_container.xpath(".//meta[@itemprop='ratingValue']/@content"))
                review['TestDateText'] = self.extract(review_container.xpath(".//span[contains(@class, 'review-date')]//text()"))
                review['TestSummary'] = self.extract(review_container.xpath(".//p[@itemprop='description']/text()"))
                review['Author'] = self.extract(review_container.xpath(".//span[contains(@class, 'user-nickname')]/text()"))
                review['TestTitle'] = self.extract(review_container.xpath(".//h4[@itemprop='name']/text()"))
                review['TestUrl'] = original_url
                try:
                    # NOTE(review): `product` is not defined anywhere in this
                    # method, so these lines always raise NameError and are
                    # silently swallowed — the xpath-derived ProductName above
                    # is what actually survives.  Probably meant to come from
                    # response.meta; confirm and fix.
                    review['ProductName'] = product['ProductName']
                    review['source_internal_id'] = product['source_internal_id']
                except:
                    pass
                review["DBaseCategoryName"] = "USER"
                review["SourceTestScale"] = "5"
                if review["TestDateText"]:
                    review["TestDateText"] = date_format(review["TestDateText"], "%d-%m-%Y", ["en"])
                yield review
def __init__(self, *a, **kw):
    """Initialise the base crawler and attach a spider-wide
    SeleniumBrowser helper instance.
    """
    AlaCrawlSpider.__init__(self, *a, **kw)
    # Shared browser used by this spider's callbacks.
    self.browser = SeleniumBrowser(self)
class SnapdealSpider(AlaCrawlSpider):
    """Crawls snapdeal.com mobile-phone categories with one shared Selenium
    browser, emitting category, product and user-review items.
    """
    name = 'snapdeal'
    download_delay = 2
    start_urls = ['http://www.snapdeal.com/page/sitemap']

    def __init__(self, *a, **kw):
        """Initialise the base crawler and the shared Selenium browser."""
        AlaCrawlSpider.__init__(self, *a, **kw)
        self.browser = SeleniumBrowser(self)

    def process_category_link(value):
        # NOTE(review): used as a plain function via process_value= in the
        # rule below (class-body scope), hence no `self` parameter.
        # Forces popularity sorting on every category listing.
        return value + "?sort=plrty&"

    rules = [
        Rule(
            LxmlLinkExtractor(
                unique=True,
                allow=['/products/mobiles-mobile-phones'],  #,
                # '/products/mobiles-tablets',
                # '/products/cameras-digital-cameras',
                # '/products/cameras-digital-slrs'
                restrict_xpaths=['//*[@class="ht180"]//li//*'],
                process_value=process_category_link),
            callback="parse_category")
    ]

    def parse_category(self, response):
        """Emit the category (unless skip-listed), scroll the listing to
        the end in Selenium, and schedule parse_product per product.
        """
        category_path_xpath = '//*[@class="containerBreadcrumb"]//span/text()'
        category_leaf_xpath = '//*[@class="active-bread"]/text()'
        clickable_element = '//*[contains(@class,"list-view-lang")]'
        loading_icon_xpath = '//*[@id="ajax-loader-icon" and @class="mar_20per_left"]'
        product_list_xpath = '//*[@id="prodDetails"]/@href'
        category_xpath = self.extract_all(response, category_path_xpath, '|')
        if category_xpath not in self.skip_categories:
            category = CategoryItem()
            category["category_path"] = category_xpath
            category["category_leaf"] = self.extract(response, category_leaf_xpath)
            category["category_url"] = response.url
            yield category
            # Wait until the list-view toggle is clickable before scrolling.
            wait_for = EC.element_to_be_clickable(
                (By.XPATH, clickable_element))
            selector = self.browser.get(response, wait_for)
            # Between scroll steps, wait for the ajax loader to vanish.
            wait_for = EC.invisibility_of_element_located(
                (By.XPATH, loading_icon_xpath))
            selector = self.browser.scroll_until_the_end(2000, wait_for)
            products = selector.xpath(product_list_xpath)
            for product in products:
                product_url = product.extract()
                request = Request(product_url, callback=self.parse_product)
                request.meta['category'] = category
                yield request

    def parse_product(self, response):
        """Build the ProductItem from the rendered page, then delegate to
        parse_reviews (which pages through the review widget)."""
        category = response.meta['category']
        product_name_xpath = '//*[@class="productTitle"]//*[@itemprop="name"]/text()'
        brand_xpath = '//*[@itemprop="brand"]//*[@itemprop="name"]/text()'
        pic_url_xpath = '//*[@class="mainImageSlider"]//*[@itemprop="image"]/@src'
        source_internal_id_xpath = '//*[@id="pppid"]/text()'
        product = ProductItem()
        product["source_internal_id"] = self.extract(
            response.xpath(source_internal_id_xpath))
        product["ProductName"] = self.extract(
            response.xpath(product_name_xpath))
        product["OriginalCategoryName"] = category['category_path']
        product["PicURL"] = self.extract(response.xpath(pic_url_xpath))
        product["ProductManufacturer"] = self.extract(
            response.xpath(brand_xpath))
        product["TestUrl"] = response.url
        for review in self.parse_reviews(response, product):
            yield review
        yield product

    def parse_reviews(self, response, product):
        """Yield a ReviewItem per review block; recurse onto the next
        review page by clicking the pager in the shared browser."""
        reviews_xpath = '//*[@class="pr-review-wrap"]'
        next_page_xpath = '//*[@class="pr-pagination-top"]//*[@class="pr-page-next"]/a'
        review_elements = response.xpath(reviews_xpath)
        for review_element in review_elements:
            yield self.parse_review(review_element, product)
        next_page = self.extract(response.xpath(next_page_xpath))
        if next_page:
            ec_condition = EC.element_to_be_clickable(
                (By.XPATH, next_page_xpath))
            self.browser.get(response, ec_condition)
            ec_condition = EC.presence_of_all_elements_located(
                (By.XPATH, '//*[@class="pr-contents-wrapper"]'))
            selector = self.browser.click(next_page_xpath, ec_condition)
            # Recurse on the freshly-rendered page.
            for review in self.parse_reviews(selector, product):
                yield review

    def parse_review(self, response, product):
        """Map one review block to a ReviewItem (dates normalised from
        DD/MM/YYYY to YYYY-MM-DD).

        NOTE(review): datetime.strptime raises ValueError on an empty or
        malformed date — confirm upstream guarantees the date text.
        """
        author_xpath = './/*[@class="prReviewAuthorProfileLnk"]/span/text()'
        rating_xpath = './/*[@class="pr-rating pr-rounded"]/text()'
        title_xpath = './/*[@class="pr-review-rating-headline"]/text()'
        verdict_xpath = './/*[@class="pr-review-bottom-line-wrapper"]/text()'
        date_xpath = './/*[contains(@class,"pr-review-author-date")]/text()'
        summary_xpath = './/*[@class="pr-comments"]'
        review = ReviewItem()
        review["source_internal_id"] = product["source_internal_id"]
        review["ProductName"] = product["ProductName"]
        review["SourceTestRating"] = self.extract(response.xpath(rating_xpath))
        extracted_date = self.extract(response.xpath(date_xpath))
        review["TestDateText"] = datetime.strptime(
            extracted_date, "%d/%m/%Y").strftime('%Y-%m-%d')
        review["TestSummary"] = self.extract(response.xpath(summary_xpath))
        review["TestVerdict"] = self.extract(response.xpath(verdict_xpath))
        review["Author"] = self.extract(response.xpath(author_xpath))
        review["DBaseCategoryName"] = "USER"
        review["TestTitle"] = self.extract(response.xpath(title_xpath))
        review["TestUrl"] = product["TestUrl"]
        return review