def parse(self, response):
    latest_review_date_xpath = "(//span[@class='date'])[1]/text()"
    latest_review_date_text = self.extract(
        response.xpath(latest_review_date_xpath))
    # Incremental scraping: stop if the newest review on the page is older
    # than the last stored date. Guard against an empty date string, which
    # would make strptime raise.
    latest_review_date = None
    if latest_review_date_text:
        latest_review_date = datetime.strptime(latest_review_date_text, "%d %B %Y")
    if latest_review_date and latest_review_date < self.stored_last_date:
        return
    next_page_xpath = "//li[@class='pager-next']//a/@href"
    review_url_xpath = ".//div[@class='teaser-content']//a/@href"
    img_xpath = ".//img/@src"
    product_items = response.xpath("//div[@id='content']//article")
    for item in product_items:
        review_url = self.extract(item.xpath(review_url_xpath))
        review_url = get_full_url(response, review_url)
        request = Request(review_url, callback=self.parse_review)
        img = self.extract(item.xpath(img_xpath))
        request.meta['PicURL'] = img
        yield request
    next_page = self.extract(response.xpath(next_page_xpath))
    if next_page:
        next_page = get_full_url(response, next_page)
        yield Request(next_page, callback=self.parse)
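# The methods in this section lean on a handful of spider helpers and
# imports that are not shown here. The following is a minimal sketch of what
# they are assumed to look like -- `extract`, `extract_all`, `extract_list`,
# `extract_xpath`, `extract_list_xpath` and `get_full_url` are inferred from
# how they are called in these snippets, not taken from the real codebase.
from datetime import datetime
from urlparse import urljoin  # Python 2; `from urllib.parse import urljoin` on 3

from scrapy import Request


def get_full_url(response_or_url, url):
    """Resolve a possibly relative URL against a response or a base URL."""
    base = getattr(response_or_url, 'url', response_or_url)
    return urljoin(base, url)


class SpiderHelpersMixin(object):
    def extract(self, selector_list):
        """First extracted string, stripped, or '' if nothing matched."""
        values = selector_list.extract()
        return values[0].strip() if values else ''

    def extract_all(self, selector_list, separator=' '):
        """All extracted strings joined with `separator`."""
        return separator.join(v.strip() for v in selector_list.extract() if v.strip())

    def extract_list(self, selector_list):
        """All extracted strings, stripped, with empties dropped."""
        return [v.strip() for v in selector_list.extract() if v.strip()]

    def extract_xpath(self, selector, xpath):
        return self.extract(selector.xpath(xpath))

    def extract_list_xpath(self, selector, xpath):
        return self.extract_list(selector.xpath(xpath))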
def parse_product_list(self, response):
    category = response.meta.get('category', None)
    if not category:
        category_path_xpath = ("//ul[@class='breadcrumbs']"
                               "/li[position() != 1 and position() < last()]/a/text()")
        category_leaf_xpath = "//ul[@class='breadcrumbs']/li[last()]/text()"
        category = CategoryItem()
        category['category_url'] = response.url
        category['category_leaf'] = self.extract(
            response.xpath(category_leaf_xpath))
        category['category_path'] = self.extract_all(
            response.xpath(category_path_xpath), separator=' | ')
        category['category_path'] = '%s | %s' % (
            category['category_path'], category['category_leaf'])
        yield category
    if self.should_skip_category(category):
        return
    next_page_url_xpath = "//a[@rel='next']/@href"
    products_xpath = "//ul[@class='products-list']/li"
    # Relative to each list item; the original '..//' would have resolved to
    # the same first link for every item.
    product_url_xpath = ".//div[@class='product-wrapper']//h2/a/@href"
    products = response.xpath(products_xpath)
    for product in products:
        product_url = self.extract_xpath(product, product_url_xpath)
        product_url = get_full_url(response, product_url)
        request = Request(product_url, callback=self.parse_product)
        request.meta['category'] = category
        yield request
    next_page_url = self.extract_xpath(response, next_page_url_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        request = Request(next_page_url, callback=self.parse_product_list)
        request.meta['category'] = category
        yield request
def parse(self, response):
    iframe_xpath = "//iframe[@id='mainframe']"
    review_url_xpath = "//div[@class='title']/a/@href"
    review_re = r'magazine/\d+/\d+/\d+/(\d+)/'
    continue_next_page = False
    with SeleniumBrowser(self, response) as browser:
        browser.get(response.url)
        selector = browser.switch_to_frame(iframe_xpath)
        next_page_xpath = "//a[@class='next_page']/@href"
        review_urls = self.extract_list(selector.xpath(review_url_xpath))
        for review_url in review_urls:
            match = re.search(review_re, review_url)
            if not match:
                self.logger.warning('Review URL did not match pattern: %s', review_url)
                continue
            source_internal_id = match.group(1)
            # Only follow reviews not yet in the DB; keep paginating as long
            # as new ones show up.
            if not is_product_in_db_by_sii(self.mysql_manager,
                                           self.spider_conf["source_id"],
                                           source_internal_id):
                continue_next_page = True
                review_url = get_full_url(response.url, review_url)
                request = Request(review_url, callback=self.parse_review)
                request.meta['source_internal_id'] = source_internal_id
                yield request
        if continue_next_page:
            next_page = self.extract(selector.xpath(next_page_xpath))
            if next_page:
                next_page = get_full_url(response.url, next_page)
                yield Request(next_page, callback=self.parse)
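# `is_product_in_db_by_sii` is not shown in this section. A plausible sketch,
# assuming the MySQL manager exposes a `query` method and a `product` table
# keyed by (source_id, source_internal_id) -- both the method name and the
# schema here are assumptions:
def is_product_in_db_by_sii(mysql_manager, source_id, source_internal_id):
    """Return True if a product with this source-internal id is already stored."""
    rows = mysql_manager.query(
        "SELECT 1 FROM product WHERE source_id = %s AND source_internal_id = %s LIMIT 1",
        (source_id, source_internal_id))
    return bool(rows)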
def level_2(self, response):
    original_url = response.url
    category = response.meta.get('category', None)
    if not category:
        category_path_xpath = ("//div[@id='content-wrapper']/div[1]/div[@id='bc']"
                               "//*[@class!='bc-bullet' and position()>1]/text()")
        category = CategoryItem()
        category['category_url'] = original_url
        category['category_path'] = self.extract_all(
            response.xpath(category_path_xpath), ' | ')
        yield category
    if self.should_skip_category(category):
        return
    url_xpath = ("//*[@class='actionbar']/following-sibling::div[1]"
                 "//a[starts-with(.,'Next')]/@href")
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        single_url = get_full_url(original_url, single_url)
        request = Request(single_url, callback=self.level_2)
        request.meta['category'] = category
        yield request
    urls_xpath = "//h4/a/@href"
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        request = Request(single_url, callback=self.level_3)
        request.meta['category'] = category
        yield request
def parse(self, response):
    product_url_xpath = "//tr[@class='prod_list_row']//td[not(@*)]/a/@href"
    category_url_xpath = "//div[contains(@class, 'list_block_item')]/span/a/@href"
    product_urls = self.extract_list(response.xpath(product_url_xpath))
    category_urls = self.extract_list(response.xpath(category_url_xpath))
    if product_urls:
        # A page containing products, e.g. all cell phones
        for product_url in product_urls:
            product_url = get_full_url(response, product_url)
            yield Request(product_url, callback=self.parse_product)
        next_page_xpath = "//span[@class='next']/a/@href"
        next_page = self.extract(response.xpath(next_page_xpath))
        if next_page:
            next_page = get_full_url(response, next_page)
            yield Request(next_page, callback=self.parse)
    elif category_urls:
        # A page containing all sub-categories of a type of product
        for category_url in category_urls:
            category_url = get_full_url(response, category_url)
            yield Request(category_url, callback=self.parse)
    else:
        # Neither products nor sub-categories: treat as a failed page and retry.
        yield self._retry(response.request)
def parse(self, response):
    category_urls = self.extract_list(
        response.xpath('//table[@class="menuColumns"]//li/a/@href'))
    for category_url in category_urls:
        category_url = get_full_url(response, category_url)
        yield Request(url=category_url, callback=self.parse)
    # Menu pages only link further down the tree; stop here on them.
    if category_urls:
        return
    category = response.meta.get('category')
    if not category:
        category = CategoryItem()
        category['category_path'] = self.extract_all(
            response.xpath('//div[@class="breadcrumb"]//text()'))
        category['category_leaf'] = self.extract(response.xpath('//h1/text()'))
        category['category_url'] = response.url
        yield category
    if not self.should_skip_category(category):
        product_urls = self.extract_list(
            response.xpath('//div[@class="product"]/div[@class="title"]/a/@href'))
        for product_url in product_urls:
            product_url = get_full_url(response, product_url)
            request = Request(url=product_url, callback=self.parse_product)
            request.meta['category'] = category
            yield request
        next_page = self.extract(response.xpath('//a[@rel="next"]/@href'))
        if next_page:
            request = Request(url=get_full_url(response, next_page), callback=self.parse)
            request.meta['category'] = category
            yield request
def parse(self, response):
    original_url = response.url
    url_xpath = "//a[@class='next page-numbers']/@href"
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.parse)
    containers_xpath = "//article"
    url_xpath = ".//header/h2/a/@href"
    # Regex for pulling the category out of the teaser link text; empty here,
    # so nothing is extracted unless a pattern is configured.
    params_regex = {}
    containers = response.xpath(containers_xpath)
    for container in containers:
        single_url = self.extract(container.xpath(url_xpath))
        single_url = get_full_url(response, single_url)
        request = Request(single_url, callback=self.level_2)
        extract_text = self.extract(container.xpath(
            './/small//p/a[not(contains(text(), "Reviews"))][1]//text()'))
        ocn_regex = params_regex.get("OriginalCategoryName")
        matches = None
        if extract_text and ocn_regex:
            matches = re.search(ocn_regex, extract_text, re.IGNORECASE)
        request.meta["OriginalCategoryName"] = matches.group(1) if matches else ""
        yield request
def level_3(self, response):
    original_url = response.url
    url_xpath = "//div[contains(@class, 'bottom')]//a[@data-selenium='pn-next']/@href"
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.level_3)
    urls_xpath = ("//div[@data-selenium='itemInfo-zone']"
                  "//a[@data-selenium='itemHeadingLink']/@href")
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.level_4)
def parse(self, response):
    next_page_xpath = ("//div[@class='pager'][last()]"
                       "//span[contains(text(),'Avanti')]/parent::a/@href")
    category_url_xpath = "//div[@class='contentWrapper']//a"
    categories = response.xpath(category_url_xpath)
    for category_sel in categories:
        category_name = self.extract(category_sel.xpath("./text()"))
        category_url = self.extract(category_sel.xpath("./@href"))
        category_url = get_full_url(response, category_url)
        category = CategoryItem()
        category['category_leaf'] = category_name
        category['category_path'] = category_name
        category['category_url'] = category_url
        yield category
        if not self.should_skip_category(category):
            request = self.selenium_request(category_url, callback=self.parse_category)
            request.meta['category'] = category
            yield request
    # The selector list must be extracted before building the request URL.
    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        yield Request(next_page_url, callback=self.parse)
def parse_category(self, response):
    category = response.meta['category']
    level = response.meta.get('level', 0)
    if 'aviation' in category['category_url']:
        return
    if 'gaming' in category['category_url'] and level == 0:
        subcat_url_xpath = "//a[@class='category-navigation__list__item__link']/@href"
        subcat_urls = self.extract_list_xpath(response, subcat_url_xpath)
        for subcat_url in subcat_urls:
            subcat_url = get_full_url(response, subcat_url)
            request = Request(subcat_url, callback=self.parse_category)
            request.meta['category'] = category
            request.meta['level'] = level + 1
            yield request
    for request in self.extract_product_requests(response):
        yield request
    next_page_xpath = "(//*[@rel='next'])[1]/@href"
    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        # request = Request(next_page_url, callback=self.parse_cat_javascript,
        #                   headers={'Accept': 'application/javascript'})
        request = Request(next_page_url, callback=self.parse_category,
                          meta=response.meta)
        request.meta['category'] = category
        yield request
def parse(self, response):
    review_divs_xpath = "//div[@id='content']/div[@id='block-system-main']"
    review_divs = response.xpath(review_divs_xpath)
    for review_div in review_divs:
        date_xpath = ".//span[@class='date-display-single']/text()"
        dates = review_div.xpath(date_xpath).getall()
        for date in dates:
            review_date = datetime.strptime(date, '%d %b %Y')
            if review_date > self.stored_last_date:
                review_urls_xpath = ".//p//a/@href"
                review_urls = review_div.xpath(review_urls_xpath).getall()
                for review_url in review_urls:
                    review_url = get_full_url(response, review_url)
                    yield Request(url=review_url, callback=self.parse_items)
    # Paginate only while the oldest review on the page is still newer than
    # the last stored date.
    next_page_xpath = "//a[@title='Go to next page']/@href"
    next_page = self.extract(response.xpath(next_page_xpath))
    if next_page:
        next_page_url = get_full_url(response, next_page)
        review_date_xpath = ("(//div[@id='block-system-main']"
                             "//span[@class='date-display-single']/text())[last()]")
        review_date = self.extract(response.xpath(review_date_xpath))
        oldest_review_date = None
        if review_date:
            oldest_review_date = datetime.strptime(review_date, "%d %b %Y")
        if oldest_review_date and oldest_review_date > self.stored_last_date:
            yield response.follow(next_page_url, callback=self.parse)
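# Several of these spiders stop paginating once page dates fall behind
# `self.stored_last_date`. A minimal sketch of how that attribute might be
# initialised -- `load_last_review_date` is an assumed stand-in for whatever
# storage the real spiders use:
from datetime import datetime

def load_last_review_date(source_id):
    # Assumed helper: look the date up in a DB or state file; return None
    # when the spider has never run for this source.
    return None

class IncrementalSpiderMixin(object):
    def init_stored_last_date(self, source_id):
        # Fall back to the epoch so a first run crawls everything.
        self.stored_last_date = (load_last_review_date(source_id)
                                 or datetime(1970, 1, 1))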
def parse_category(self, response):
    products_xpath = "//div[@class='product_info']"
    product_url_xpath = "./div[@class='product_name']/a/@href"
    has_review_xpath = ".//div[@class='product_rating']/span[@class!='noRating']"
    next_page_xpath = "(//*[@rel='next'])[1]/@href"
    products = response.xpath(products_xpath)
    category = response.meta['category']
    category_name = category['category_path']
    # Not a leaf category page
    if not products:
        return
    for product in products:
        has_review = product.xpath(has_review_xpath)
        if not has_review:
            continue
        product_url = self.extract(product.xpath(product_url_xpath))
        product_url = get_full_url(response, product_url)
        request = Request(product_url, callback=self.parse_product)
        request.meta['OriginalCategoryName'] = category_name
        yield request
    next_page_url = self.extract_xpath(response, next_page_xpath)
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        next_page_request = Request(next_page_url, callback=self.parse_category)
        next_page_request.meta['category'] = category
        yield next_page_request
def parse_category(self, response):
    products_xpath = "//div[@data-component='product-list-view']/article/div[@class='desc']"
    next_page_xpath = "//a[@class='next']/@href"
    product_url_xpath = "./a/@href"
    has_review_xpath = ".//*[contains(@class, 'reevoo-score')]"
    products = response.xpath(products_xpath)
    if not products:
        return
    # This category may be too general, but yielding it lets us learn
    # whether it can be skipped.
    category_json_ld = extruct_helper.extract_json_ld(response.body, 'BreadcrumbList')
    if category_json_ld:
        category = extruct_helper.category_item_from_breadcrumbs_json_ld(category_json_ld)
        yield category
        if self.should_skip_category(category):
            return
    for product in products:
        has_review = product.xpath(has_review_xpath)
        if not has_review:
            continue
        product_url = self.extract(product.xpath(product_url_xpath))
        request = Request(url=get_full_url(response, product_url),
                          callback=self.parse_product)
        yield request
    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        yield Request(next_page_url, callback=self.parse_category)
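# `extruct_helper` wraps the `extruct` structured-data library. A minimal
# sketch of the two calls used above, assuming these wrapper names; only
# `extruct.extract` itself is the real library API:
import extruct

def extract_json_ld(html, type_name):
    """Return the first JSON-LD block of the given @type, or None."""
    data = extruct.extract(html, syntaxes=['json-ld'])
    for block in data.get('json-ld', []):
        if block.get('@type') == type_name:
            return block
    return None

def category_item_from_breadcrumbs_json_ld(breadcrumbs):
    """Build a CategoryItem from a BreadcrumbList JSON-LD block."""
    names = []
    for element in breadcrumbs.get('itemListElement', []):
        item = element.get('item')
        names.append(item['name'] if isinstance(item, dict) else element.get('name', ''))
    category = CategoryItem()
    category['category_path'] = ' > '.join(n for n in names if n)
    category['category_leaf'] = names[-1] if names else ''
    return category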
def parse(self, response):
    review_divs_xpath = "//div[@id='content']"
    review_divs = response.xpath(review_divs_xpath)
    for review_div in review_divs:
        date_xpath = ".//span[@class='posted-on']/text()"
        dates = review_div.xpath(date_xpath).getall()
        for date in dates:
            date = str(date).replace(" at ", " ")
            review_date = dateparser.parse(date)
            if review_date and review_date > self.stored_last_date:
                review_urls_xpath = (".//h2[@class='title']"
                                     "/a[@class='journal-entry-navigation-current']/@href")
                review_urls = review_div.xpath(review_urls_xpath).getall()
                for review_url in review_urls:
                    review_url = get_full_url(response, review_url)
                    yield Request(url=review_url, callback=self.parse_items)
    next_page_xpath = "//span[@class='paginationControlNextPageSuffix']/a/@href"
    next_page = self.extract(response.xpath(next_page_xpath))
    if next_page:
        next_page_url = get_full_url(response, next_page)
        review_date_xpath = "(//span[@class='posted-on']/text())[last()]"
        review_date = self.extract(response.xpath(review_date_xpath))
        date = str(review_date).replace(" at ", " ")
        oldest_review_date = dateparser.parse(date)
        # dateparser returns None on unparseable input; guard the comparison.
        if oldest_review_date and oldest_review_date > self.stored_last_date:
            yield response.follow(next_page_url, callback=self.parse)
def parse_category(self, response):
    next_page_xpath = "(//*[@rel='next'])[1]/@href"
    category = response.meta.get('category')
    if not category:
        category = CategoryItem()
        category['category_path'] = self.extract_all(
            response.xpath('//ul[@id="breadcrumb"]//text()'), " > ")
        category['category_leaf'] = self.extract(response.xpath('//h1/text()'))
        category['category_url'] = response.url
        yield category
    if not self.should_skip_category(category):
        review_urls_xpath = '//div[@class="reviewsContainer"]/a[@id="ratingLink"]/@href'
        review_urls = self.extract_list(response.xpath(review_urls_xpath))
        for review_url in review_urls:
            # Drop the '#reviewsTab' fragment (str.strip would eat characters,
            # not the suffix).
            review_url = get_full_url(response, review_url).split('#')[0]
            request = Request(url=review_url, callback=self.parse_product)
            request.meta['category'] = category
            yield request
        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_request = Request(url=get_full_url(response, next_page_url),
                                        callback=self.parse_category)
            next_page_request.meta['category'] = category
            yield next_page_request
def parse_category(self, response):
    reviewed_products = self.extract_list(
        response.xpath('//div[@id="products"]//a[@class="reviews"]/@href'))
    if reviewed_products:
        category = response.meta.get('category')
        if not category:
            category = CategoryItem()
            category['category_path'] = self.extract_all(
                response.xpath('//ul[@id="headerCrumb"]//a/text()'), " ; ")
            category['category_leaf'] = self.extract(
                response.xpath('//ul[@id="headerCrumb"]/li[last()]/a/text()'))
            category['category_url'] = response.url
            yield category
        if not self.should_skip_category(category):
            for product in reviewed_products:
                # Drop the '#customer_reviews' fragment.
                product = product.split('#')[0]
                request = Request(get_full_url(response, product),
                                  callback=self.parse_product)
                request.meta['category'] = category
                yield request
            next_page = self.extract(response.xpath('//a[@title="Next"]/@href'))
            if next_page:
                request = Request(get_full_url(response, next_page), callback=self.parse)
                request.meta['category'] = category
                yield request
def parse_product(self, response):
    reviews = response.xpath('//section[article[contains(@class,"review")]]')
    if not reviews:
        return
    product = ProductItem()
    product['TestUrl'] = response.url
    product['OriginalCategoryName'] = 'Cell Phones'
    product['ProductName'] = self.extract(
        response.xpath('//meta[@itemprop="name"]/@content'))
    pic_url = self.extract(response.xpath('//meta[@property="og:image"]/@content'))
    product['PicURL'] = get_full_url(response, pic_url)
    product['ProductManufacturer'] = self.extract(
        response.xpath('//meta[@itemprop="brand"]/@content'))
    yield product
    user_reviews = reviews.xpath('./article[@itemprop="review"]')
    for review in user_reviews:
        user_review = ReviewItem()
        user_review['DBaseCategoryName'] = "USER"
        user_review['ProductName'] = product['ProductName']
        user_review['TestUrl'] = product['TestUrl']
        date = self.extract(review.xpath('.//span[@class="time"]/text()'))
        user_review['TestDateText'] = date_format(date, '')
        user_review['SourceTestRating'] = self.extract(
            review.xpath('.//meta[@itemprop="ratingValue"]/@content'))
        user_review['Author'] = self.extract(
            review.xpath('.//span[@itemprop="author"]/text()'))
        user_review['TestPros'] = self.extract_all(
            review.xpath('.//div[contains(@class,"positives")]/text()'), '; ')
        user_review['TestCons'] = self.extract_all(
            review.xpath('.//div[contains(@class,"negatives")]/text()'), '; ')
        yield user_review
    pro_review_url = self.extract(
        reviews.xpath('./article[contains(@class,"expert")]/div/a/@href'))
    if pro_review_url:
        request = Request(url=get_full_url(response, pro_review_url),
                          callback=self.parse_review)
        request.meta['product'] = product
        yield request
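# The CategoryItem / ProductItem / ReviewItem classes come from an items
# module that is not part of this section. A sketch of their likely shape,
# with fields inferred from the assignments in these snippets (the real
# definitions may carry more fields):
import scrapy

class CategoryItem(scrapy.Item):
    category_path = scrapy.Field()
    category_leaf = scrapy.Field()
    category_url = scrapy.Field()

class ProductItem(scrapy.Item):
    source_internal_id = scrapy.Field()
    ProductName = scrapy.Field()
    ProductManufacturer = scrapy.Field()
    OriginalCategoryName = scrapy.Field()
    PicURL = scrapy.Field()
    TestUrl = scrapy.Field()

class ReviewItem(scrapy.Item):
    source_internal_id = scrapy.Field()
    ProductName = scrapy.Field()
    DBaseCategoryName = scrapy.Field()  # "USER" or "PRO"
    TestUrl = scrapy.Field()
    TestTitle = scrapy.Field()
    TestSummary = scrapy.Field()
    TestDateText = scrapy.Field()
    TestPros = scrapy.Field()
    TestCons = scrapy.Field()
    SourceTestRating = scrapy.Field()
    Author = scrapy.Field()
    AwardPic = scrapy.Field()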
def parse_sub_category(self, response):
    products_xpath = "//ul[@id='product-offer-list']/li[contains(@class, 'list-item')]"
    product_url_xpath = ".//h4[@class='item-name']/a/@href"
    has_reviews_xpath = './/div[@class="rating-in-words"]/a/@href'
    category = response.meta.get('category', None)
    if not category:
        category = CategoryItem()
        category['category_path'] = self.extract_all(
            response.xpath('//ol[@class="breadcrumb"]//span/text()'), " > ")
        category['category_leaf'] = self.extract(response.xpath('//h1/text()'))
        category['category_url'] = response.url
        yield category
    if not self.should_skip_category(category):
        products = response.xpath(products_xpath)
        for product in products:
            has_reviews = product.xpath(has_reviews_xpath)
            if not has_reviews:
                continue
            product_url = self.extract(product.xpath(product_url_xpath))
            if product_url:
                product_url = get_full_url(response, product_url)
                request = Request(url=product_url, callback=self.parse_product)
                request.meta['category'] = category
                yield request
        next_page_url = self.extract(response.xpath('//a[@class="next_page"]/@href'))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(url=next_page_url, callback=self.parse_sub_category)
            request.meta['category'] = category
            yield request
def level_3(self, response):
    original_url = response.url
    url_xpath = "//a[@class='gspr next']/@href"
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.level_3)
    urls_xpath = "//div[@class='mimg itmcd img']//a[@class='vip']/@href"
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.level_4)
def parse_category(self, response):
    next_page_xpath = "//*[@rel='next']/@href"
    sub_category_xpath = "//div[@id='subCategorycategories']//a/@href"
    sub_cat_urls = self.extract_list(response.xpath(sub_category_xpath))
    if sub_cat_urls:
        for sub_cat_url in sub_cat_urls:
            sub_cat_url = get_full_url(response, sub_cat_url)
            yield Request(sub_cat_url, callback=self.parse_category)
    else:
        product_url_xpath = "//div[@class='description']/a/@href"
        product_urls = self.extract_list(response.xpath(product_url_xpath))
        for product_url in product_urls:
            product_url = get_full_url(response, product_url)
            yield Request(product_url, callback=self.parse_product)
        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            yield Request(next_page_url, callback=self.parse_category)
def parse_category(self, response):
    latest_review_date_xpath = ('//div[contains(@class, '
                                '"field-name-field-published-date")]//text()')
    next_page_xpath = '//a[@title="Go to next page"]/@href'
    review_url_xpath = '//div[@id="content"]//p[@class="title"]/span/a/@href'
    review_urls = self.extract_list(response.xpath(review_url_xpath))
    for review_url in review_urls:
        review_url = get_full_url(response, review_url)
        yield Request(review_url, callback=self.parse_review)
    # Incremental scraping: stop paginating once the newest review on the
    # page is older than the last stored date.
    latest_review_date_text = self.extract_xpath(response, latest_review_date_xpath)
    latest_review_date = None
    if latest_review_date_text:
        latest_review_date = dateparser.parse(latest_review_date_text)
    if latest_review_date and latest_review_date < self.stored_last_date:
        return
    next_page = self.extract(response.xpath(next_page_xpath))
    if next_page:
        next_page = get_full_url(response, next_page)
        yield Request(next_page, callback=self.parse_category)
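# `dateparser.parse` (a real library) is used above and in several other
# spiders here because these sources emit dates in mixed locales and formats;
# unlike `datetime.strptime` it needs no explicit format string and returns
# None on unparseable input:
import dateparser

assert dateparser.parse("21 March 2019").year == 2019
assert dateparser.parse("21 marzo 2019").month == 3   # Italian month names work too
assert dateparser.parse("not a date") is None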
def parse(self, response):
    category_name_xpath = '(//h1[1])/text()'
    products_xpath = "//ul[@class='products']/li"
    next_page_xpath = "(//*[@rel='next'])[1]/@href"
    product_url_xpath = "./a/@href"
    has_review_xpath = ".//ul[contains(@title, 'Average Rating')]"
    category = response.meta.get('category', '')
    if not category:
        category = CategoryItem()
        category['category_url'] = response.url
        category['category_leaf'] = self.extract(response.xpath(category_name_xpath))
        category['category_path'] = category['category_leaf']
        yield category
    products = response.xpath(products_xpath)
    for product in products:
        has_reviews = self.extract(product.xpath(has_review_xpath))
        if has_reviews:
            product_url = self.extract(product.xpath(product_url_xpath))
            product_url = get_full_url(response, product_url)
            request = Request(product_url, callback=self.parse_product)
            request.meta['category'] = category
            yield request
    next_page_url = self.extract(response.xpath(next_page_xpath))
    if next_page_url:
        next_page_url = get_full_url(response, next_page_url)
        request = Request(next_page_url, callback=self.parse)
        request.meta['category'] = category
        yield request
def parse_category(self, response):
    category = response.meta.get('category')
    if not category:
        category = CategoryItem()
        category['category_path'] = self.extract_all(
            response.xpath('//div[contains(@class,"localizer")]/*/text()'))
        category['category_leaf'] = self.extract(response.xpath(
            '//div[contains(@class,"localizer")]/span[last()]/text()'))
        category['category_url'] = response.url
        yield category
    if not self.should_skip_category(category):
        product_urls = self.extract_list(response.xpath(
            '//div[contains(@class,"ckPoints")]/ancestor::div[@class="infoWrapper"]'
            '//p[@class="title"]/a/@href'))
        for product_url in product_urls:
            # Append the "sort by date added, descending" path segment.
            product_url = get_full_url(response, product_url) + '/1/data_dodania/malejaco'
            request = Request(url=product_url, callback=self.parse_product)
            request.meta['category'] = category
            yield request
        next_page_urls = self.extract_list(
            response.xpath('//a[contains(@class,"right")]/@href'))
        if next_page_urls:
            next_page_url = get_full_url(response, next_page_urls[0])
            request = Request(url=next_page_url, callback=self.parse_category)
            request.meta['category'] = category
            yield request
def parse(self, response):
    original_url = response.url
    url_xpath = "//a[img[@alt='vor']]/@href"
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.parse)
    urls_xpath = "//li[contains(@class, 'ttboxpad')]//a[contains(@class, 'extra')]/@href"
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        yield Request(single_url, callback=self.level_2)
def level_2(self, response):
    original_url = response.url
    product_xpaths = {
        "source_internal_id": "substring-before(substring-after(//body/@class,'postid-'),' ')",
        "ProductName": "//h1//text()",
        "OriginalCategoryName": "//meta[@property='article:section']/@content",
        "PicURL": "//meta[@property='og:image']/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product['TestUrl'] = original_url
    # Normalise protocol-relative and root-relative picture URLs.
    picurl = product.get("PicURL", "")
    if picurl and picurl[:2] == "//":
        product["PicURL"] = "https:" + picurl
    elif picurl and picurl[:1] == "/":
        product["PicURL"] = get_full_url(original_url, picurl)
    # Prefer the category passed along in meta, if any.
    category = response.meta.get("category")
    if category:
        product["OriginalCategoryName"] = category['category_path']
    review_xpaths = {
        "source_internal_id": "substring-before(substring-after(//body/@class,'postid-'),' ')",
        "ProductName": "//h1//text()",
        "TestDateText": "substring-before(//meta[contains(@property,'published_time')]/@content,'T')",
        "TestSummary": "//meta[@property='og:description']/@content",
        "Author": "//div[@class='single-info']/a//text()",
        "TestTitle": "//h1//text()",
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)
    review['TestUrl'] = original_url
    if product.get('ProductName'):
        review['ProductName'] = product['ProductName']
    if product.get('source_internal_id'):
        review['source_internal_id'] = product['source_internal_id']
    # Normalise the award picture URL the same way as PicURL.
    awpic_link = review.get("AwardPic", "")
    if awpic_link and awpic_link[:2] == "//":
        review["AwardPic"] = "https:" + awpic_link
    elif awpic_link and awpic_link[:1] == "/":
        review["AwardPic"] = get_full_url(original_url, awpic_link)
    review["DBaseCategoryName"] = "PRO"
    yield product
    yield review
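# `init_item_by_xpaths` is the generator-framework helper these level_2
# methods rely on. A minimal sketch, assuming it maps an item type name to an
# item class and fills each field from its XPath; the real helper may also
# handle join separators and defaults. Note that literal parameters such as
# "game" select nothing as XPath, which is why callers apply fallbacks:
def init_item_by_xpaths(self, response, item_type, xpaths):
    item = {"product": ProductItem, "review": ReviewItem}[item_type]()
    for field, xpath in xpaths.items():
        value = self.extract_all(response.xpath(xpath))
        if value:
            item[field] = value
    return item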
def parse(self, response):
    original_url = response.url
    product = response.meta.get("product", {})
    review = response.meta.get("review", {})
    url_xpath = "(//p[@class='pager']/span[@class='pages']/following::a)[1]/@href"
    single_url = self.extract(response.xpath(url_xpath))
    single_url = single_url.replace('\n', '').replace('\t', '')
    if single_url:
        single_url = get_full_url(original_url, single_url)
        self.logger.debug('Following next page: %s', single_url)
        request = Request(single_url, callback=self.parse)
        request.meta["product"] = product
        request.meta["review"] = review
        yield request
    urls_xpath = "//ul[@class='reset']/li/div[@class='spc']/h2/a/@href"
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        request = Request(single_url, callback=self.level_2)
        request.meta["product"] = product
        request.meta["review"] = review
        yield request
def level_2(self, response):
    original_url = response.url
    product_xpaths = {
        "ProductName": "//h1[@itemprop='name']/text()",
        "OriginalCategoryName": "game",  # literal fallback, not an XPath
        "PicURL": "//meta[@property='og:image']/@content",
    }
    product = self.init_item_by_xpaths(response, "product", product_xpaths)
    product['TestUrl'] = original_url
    # Normalise protocol-relative and root-relative picture URLs.
    picurl = product.get("PicURL", "")
    if picurl and picurl[:2] == "//":
        product["PicURL"] = "https:" + picurl
    elif picurl and picurl[:1] == "/":
        product["PicURL"] = get_full_url(original_url, picurl)
    # Prefer the category passed along in meta; otherwise fall back to the
    # literal "game".
    category = response.meta.get("category")
    if category:
        product["OriginalCategoryName"] = category['category_path']
    if not product.get("OriginalCategoryName"):
        product["OriginalCategoryName"] = "game"
    review_xpaths = {
        "ProductName": "//h1[@itemprop='name']/text()",
        "TestPros": "//div[h2[contains(text(),'Pros')]]/following-sibling::div//li/text()",
        "TestCons": "//div[h2[contains(text(),'Cons')]]/following-sibling::div//li/text()",
        "TestSummary": "//div[@class='post_content']/p[contains(.,' ')][not(img)][1]//text()",
        "Author": "//a[@itemprop='author']/text()",
        "TestTitle": "//h1[@itemprop='name']/text()",
    }
    review = self.init_item_by_xpaths(response, "review", review_xpaths)
    review['TestUrl'] = original_url
    if product.get('ProductName'):
        review['ProductName'] = product['ProductName']
    if product.get('source_internal_id'):
        review['source_internal_id'] = product['source_internal_id']
    awpic_link = review.get("AwardPic", "")
    if awpic_link and awpic_link[:2] == "//":
        review["AwardPic"] = "https:" + awpic_link
    elif awpic_link and awpic_link[:1] == "/":
        review["AwardPic"] = get_full_url(original_url, awpic_link)
    review["DBaseCategoryName"] = "PRO"
    yield product
    yield review
def parse_category(self, response):
    is_category_xpath = "//*[@class='productList']"
    has_sub_cat_xpath = "//*[@class='Department']"
    category_path_xpath = "//ul[@class='breadcrumbList']/li[position() < last()]/a//text()"
    category_leaf_xpath = "//ul[@class='breadcrumbList']/li[last()]/a//text()"
    if not response.xpath(is_category_xpath):
        return
    if not response.xpath(has_sub_cat_xpath):
        # Leaf category: collect rated products and paginate.
        next_page_xpath = "(//*[@rel='next'])[1]/@href"
        category = response.meta.get('category', None)
        if not category:
            category = CategoryItem()
            category['category_url'] = response.url
            category['category_leaf'] = self.extract(
                response.xpath(category_leaf_xpath))
            category['category_path'] = self.extract_all(
                response.xpath(category_path_xpath), separator=' | ')
            category['category_path'] = '%s | %s' % (
                category['category_path'], category['category_leaf'])
            yield category
        if self.should_skip_category(category):
            return
        products_xpath = "//*[@class='productList']/li/div[@class='productInfo']"
        product_url_xpath = "./a[@class='productTitle'][1]/@href"
        product_rating_xpath = "./div[@class='bvRollup']"
        products = response.xpath(products_xpath)
        for product in products:
            has_reviews = self.extract(product.xpath(product_rating_xpath))
            if has_reviews:
                product_url = self.extract(product.xpath(product_url_xpath))
                product_url = get_full_url(response, product_url)
                request = Request(product_url, callback=self.parse_product)
                request.meta['category'] = category
                yield request
        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            request = Request(next_page_url, callback=self.parse_category)
            request.meta['category'] = category
            yield request
    else:
        # Department page: recurse into the visible sub-categories.
        subcat_url_xpath = ("(//*[@class='Department']/following::ul[1])"
                            "/li[not(contains(@class, 'hidden'))]/a/@href")
        subcat_urls = self.extract_list(response.xpath(subcat_url_xpath))
        for subcat_url in subcat_urls:
            subcat_url = get_full_url(response, subcat_url)
            yield Request(subcat_url, callback=self.parse_category)
def parse(self, response):
    original_url = response.url
    product = response.meta.get("product", {})
    review = response.meta.get("review", {})
    url_xpath = "//span[contains(.,'ta sida')]/../@href"
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        # Pagination links are relative to the /bloggen section; prefixing
        # before the emptiness check would make the URL always truthy.
        single_url = get_full_url(original_url, '/bloggen' + single_url)
        request = Request(single_url, callback=self.parse)
        request.meta["product"] = product
        request.meta["review"] = review
        yield request
    urls_xpath = ("//div[@class='blogg_big_container'][contains(.,'Test')]"
                  "//a[contains(@href,'article')]/@href")
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        request = Request(single_url, callback=self.level_2)
        request.meta["product"] = product
        request.meta["review"] = review
        yield request
def parse(self, response):
    original_url = response.url
    product = response.meta.get("product", {})
    review = response.meta.get("review", {})
    url_xpath = "//div[@class='navigation']/div[contains(@class,'left')]//a/@href"
    single_url = self.extract(response.xpath(url_xpath))
    if single_url:
        single_url = get_full_url(original_url, single_url)
        request = Request(single_url, callback=self.parse)
        request.meta["product"] = product
        request.meta["review"] = review
        yield request
    urls_xpath = "//section//article//a/@href"
    urls = self.extract_list(response.xpath(urls_xpath))
    for single_url in urls:
        single_url = get_full_url(original_url, single_url)
        request = Request(single_url, callback=self.level_2)
        request.meta["product"] = product
        request.meta["review"] = review
        yield request