Ejemplo n.º 1
0
    def parse(self, response):
        """Crawl a review listing page, following review links and pagination.

        Stops early (incremental scraping) when the newest review on the
        page predates the last stored review date.
        """
        newest_date_text = self.extract(
            response.xpath("(//span[@class='date'])[1]/text()"))
        newest_date = datetime.strptime(newest_date_text, "%d %B %Y")
        # Nothing new since the last run -> stop before spawning requests.
        if newest_date and newest_date < self.stored_last_date:
            return

        for article in response.xpath("//div[@id='content']//article"):
            review_href = self.extract(article.xpath(
                ".//div[@class='teaser-content']//a/@href"))
            review_request = Request(get_full_url(response, review_href),
                                     callback=self.parse_review)
            # Carry the product picture along to the review parser.
            review_request.meta['PicURL'] = self.extract(
                article.xpath(".//img/@src"))
            yield review_request

        pager_href = self.extract(
            response.xpath("//li[@class='pager-next']//a/@href"))
        yield Request(get_full_url(response, pager_href), callback=self.parse)
Ejemplo n.º 2
0
    def parse_product_list(self, response):
        """Emit the category item on first visit, request each product and
        follow pagination with the category propagated through meta."""
        category = response.meta.get('category', None)
        if not category:
            # First time on this listing: build the category from breadcrumbs.
            category = CategoryItem()
            category['category_url'] = response.url
            category['category_leaf'] = self.extract(response.xpath(
                "//ul[@class='breadcrumbs']/li[last()]/text()"))
            parent_path = self.extract_all(
                response.xpath("//ul[@class='breadcrumbs']/li[position() != 1 and position() < last()]/a/text()"),
                separator=' | ')
            category['category_path'] = '%s | %s' % (
                parent_path, category['category_leaf'])
            yield category

        if self.should_skip_category(category):
            return

        for product in response.xpath("//ul[@class='products-list']/li"):
            product_href = self.extract_xpath(
                product, "..//div[@class='product-wrapper']//h2/a/@href")
            product_request = Request(get_full_url(response, product_href),
                                      callback=self.parse_product)
            product_request.meta['category'] = category
            yield product_request

        next_href = self.extract_xpath(response, "//a[@rel='next']/@href")
        if next_href:
            page_request = Request(get_full_url(response, next_href),
                                   callback=self.parse_product_list)
            page_request.meta['category'] = category
            yield page_request
Ejemplo n.º 3
0
    def parse(self, response):
        """Crawl the review index rendered inside a Selenium-driven iframe.

        Yields one request per review whose product is already in the DB,
        and follows pagination only when at least one such review was found.
        """
        iframe_xpath = "//iframe[@id='mainframe']"
        review_url_xpath = "//div[@class='title']/a/@href"
        # Captures the source-internal id embedded in the review URL.
        review_re = r'magazine/\d+/\d+/\d+/(\d+)/'
        continue_next_page = False
        with SeleniumBrowser(self, response) as browser:
            browser.get(response.url)
            selector = browser.switch_to_frame(iframe_xpath)

            next_page_xpath = "//a[@class='next_page']/@href"
            review_urls = self.extract_list(selector.xpath(review_url_xpath))

            for review_url in review_urls:
                match = re.search(review_re, review_url)
                if not match:
                    # BUG FIX: was a Python 2 `print review_url` statement,
                    # a syntax error under Python 3; use the function form.
                    print(review_url)
                    continue
                source_internal_id = match.group(1)
                if not is_product_in_db_by_sii(self.mysql_manager,
                                               self.spider_conf["source_id"],
                                               source_internal_id):
                    continue_next_page = True
                    review_url = get_full_url(response.url, review_url)
                    request = Request(review_url, callback=self.parse_review)
                    request.meta['source_internal_id'] = source_internal_id
                    yield request

            if continue_next_page:
                next_page = self.extract(selector.xpath(next_page_xpath))
                next_page = get_full_url(response.url, next_page)
                if next_page:
                    request = Request(next_page, callback=self.parse)
                    yield request
Ejemplo n.º 4
0
    def level_2(self, response):
        """Emit the category on first visit, follow paging (level_2) and
        product links (level_3)."""
        page_url = response.url

        category = response.meta.get('category', None)
        if not category:
            category = CategoryItem()
            category['category_url'] = page_url
            category['category_path'] = self.extract_all(
                response.xpath("//div[@id='content-wrapper']/div[1]/div[@id='bc']//*[@class!='bc-bullet' and position()>1]/text()"),
                ' | ')
            yield category

        if self.should_skip_category(category):
            return

        # Pagination: re-enter this same callback for the "Next" link.
        next_href = self.extract(response.xpath(
            "//*[@class='actionbar']/following-sibling::div[1]//a[starts-with(.,'Next')]/@href"))
        if next_href:
            paging_request = Request(get_full_url(page_url, next_href),
                                     callback=self.level_2)
            paging_request.meta['category'] = category
            yield paging_request

        for href in self.extract_list(response.xpath("//h4/a/@href")):
            product_request = Request(get_full_url(page_url, href),
                                      callback=self.level_3)
            product_request.meta['category'] = category
            yield product_request
Ejemplo n.º 5
0
    def parse(self, response):
        """Dispatch a page: product listing, sub-category index, or retry."""
        product_urls = self.extract_list(response.xpath(
            "//tr[@class='prod_list_row']//td[not(@*)]/a/@href"))
        category_urls = self.extract_list(response.xpath(
            "//div[contains(@class, 'list_block_item')]/span/a/@href"))

        if product_urls:
            # A page containing products, e.g. all cell phones
            for url in product_urls:
                yield Request(get_full_url(response, url),
                              callback=self.parse_product)

            next_page = self.extract(response.xpath(
                "//span[@class='next']/a/@href"))
            if next_page:
                yield Request(get_full_url(response, next_page),
                              callback=self.parse)
        elif category_urls:
            # A page containing all sub-categories of a type of product
            for url in category_urls:
                yield Request(get_full_url(response, url),
                              callback=self.parse)
        else:
            # Neither products nor categories found: retry the request.
            yield self._retry(response.request)
            return
Ejemplo n.º 6
0
    def parse(self, response):
        """Expand menu categories; on leaf pages emit the category and
        request each product, following pagination."""
        menu_urls = self.extract_list(response.xpath(
            '//table[@class="menuColumns"]//li/a/@href'))

        for menu_url in menu_urls:
            yield Request(url=get_full_url(response, menu_url),
                          callback=self.parse)

        # Menu pages only fan out; nothing else to do on them.
        if menu_urls:
            return

        category = response.meta.get('category')

        if not category:
            category = CategoryItem()
            category['category_path'] = self.extract_all(
                response.xpath('//div[@class="breadcrumb"]//text()'))
            category['category_leaf'] = self.extract(
                response.xpath('//h1/text()'))
            category['category_url'] = response.url
            yield category

        if self.should_skip_category(category):
            return

        for product_url in self.extract_list(response.xpath(
                '//div[@class="product"]/div[@class="title"]/a/@href')):
            product_request = Request(url=get_full_url(response, product_url),
                                      callback=self.parse_product)
            product_request.meta['category'] = category
            yield product_request

        next_page = self.extract(response.xpath('//a[@rel="next"]/@href'))
        if next_page:
            page_request = Request(url=get_full_url(response, next_page),
                                   callback=self.parse)
            page_request.meta['category'] = category
            yield page_request
Ejemplo n.º 7
0
 def parse(self, response):
     """Follow listing pagination and hand each article off to level_2.

     Each article request carries a regex-extracted category name in its
     meta under ``OriginalCategoryName`` (empty when no regex is set).
     """
     original_url = response.url

     # Pagination: follow the "next" link, re-entering this callback.
     # (The always-false ``if "":`` template branches were dead code and
     # have been removed.)
     next_url = self.extract(response.xpath(
         "//a[@class='next page-numbers']/@href"))
     if next_url:
         next_url = get_full_url(original_url, next_url)
         yield Request(next_url, callback=self.parse)

     # Per-field regexes; empty by default (no category extraction).
     params_regex = {}
     for container in response.xpath("//article"):
         single_url = self.extract(container.xpath(".//header/h2/a/@href"))
         single_url = get_full_url(response, single_url)
         request = Request(single_url, callback=self.level_2)

         extract_text = self.extract(container.xpath(
             './/small//p/a[not(contains(text(), "Reviews"))][1]//text()'))
         category_name = ""
         # BUG FIX: the regex used to be looked up with [] on an empty
         # dict, raising KeyError whenever extract_text was non-empty.
         category_regex = params_regex.get("OriginalCategoryName")
         if extract_text and category_regex:
             matches = re.search(category_regex, extract_text,
                                 re.IGNORECASE)
             if matches:
                 category_name = matches.group(1)
         request.meta["OriginalCategoryName"] = category_name

         yield request
Ejemplo n.º 8
0
    def level_3(self, response):
        """Follow listing pagination (re-entering level_3) and dispatch
        each item page to level_4.

        NOTE(review): the always-false ``if "":`` regex branches from the
        spider template could never run and were removed.
        """
        original_url = response.url

        # Pagination: keep walking the listing via this same callback.
        next_url = self.extract(response.xpath(
            "//div[contains(@class, 'bottom')]//a[@data-selenium='pn-next']/@href"))
        if next_url:
            yield Request(get_full_url(original_url, next_url),
                          callback=self.level_3)

        # Individual item pages go to the next crawl level.
        item_urls = self.extract_list(response.xpath(
            "//div[@data-selenium='itemInfo-zone']//a[@data-selenium='itemHeadingLink']/@href"))
        for item_url in item_urls:
            yield Request(get_full_url(original_url, item_url),
                          callback=self.level_4)
Ejemplo n.º 9
0
    def parse(self, response):
        """Emit one CategoryItem per category link, crawl each category via
        a Selenium request, and follow pagination."""
        next_page_xpath = "//div[@class='pager'][last()]//span[contains(text(),'Avanti')]/parent::a/@href"

        for category_sel in response.xpath(
                "//div[@class='contentWrapper']//a"):
            category_name = self.extract(category_sel.xpath("./text()"))
            category_url = get_full_url(
                response, self.extract(category_sel.xpath("./@href")))

            category = CategoryItem()
            category['category_leaf'] = category_name
            category['category_path'] = category_name
            category['category_url'] = category_url
            yield category

            if not self.should_skip_category(category):
                request = self.selenium_request(category_url,
                                                callback=self.parse_category)
                request.meta['category'] = category
                yield request

        # BUG FIX: the next-page href used to be passed to get_full_url as
        # a raw SelectorList instead of an extracted string.
        next_page_url = self.extract(response.xpath(next_page_xpath))
        if next_page_url:
            yield Request(get_full_url(response, next_page_url),
                          callback=self.parse)
Ejemplo n.º 10
0
    def parse_category(self, response):
        """Crawl a category page: expand gaming sub-categories once, yield
        product requests, and follow pagination.

        Aviation categories are skipped entirely.
        """
        category = response.meta['category']
        level = response.meta.get('level', 0)
        if 'aviation' in category['category_url']:
            return

        # Gaming hides products one navigation level deeper; expand it
        # exactly once (guarded by level) to avoid infinite recursion.
        if 'gaming' in category['category_url'] and level == 0:
            subcat_url_xpath = "//a[@class='category-navigation__list__item__link']/@href"
            subcat_urls = self.extract_list_xpath(response, subcat_url_xpath)
            for subcat_url in subcat_urls:
                subcat_url = get_full_url(response, subcat_url)
                request = Request(subcat_url, callback=self.parse_category)
                request.meta['category'] = category
                request.meta['level'] = level + 1
                yield request

        for request in self.extract_product_requests(response):
            yield request

        next_page_xpath = "(//*[@rel='next'])[1]/@href"
        next_page_url = self.extract_xpath(response, next_page_xpath)
        if next_page_url:
            next_page_url = get_full_url(response, next_page_url)
            # Propagate the full meta (category + level) to the next page.
            # (Commented-out dead code for a javascript variant removed.)
            request = Request(next_page_url,
                              callback=self.parse_category,
                              meta=response.meta)
            request.meta['category'] = category
            yield request
Ejemplo n.º 11
0
    def parse(self, response):
        """Yield requests for reviews newer than the stored last date and
        paginate while the oldest review on the page is still unseen."""
        review_divs = response.xpath(
            "//div[@id='content']/div[@id='block-system-main']")

        for review_div in review_divs:
            dates = review_div.xpath(
                ".//span[@class='date-display-single']/text()").getall()
            for date in dates:
                review_date = datetime.strptime(date, '%d %b %Y')
                if review_date > self.stored_last_date:
                    review_urls = review_div.xpath(".//p//a/@href").getall()
                    for review_url in review_urls:
                        review_url = get_full_url(response, review_url)
                        yield Request(url=review_url,
                                      callback=self.parse_items)

        next_page = self.extract(response.xpath(
            "//a[@title='Go to next page']/@href"))
        if next_page:
            next_page_url = get_full_url(response, next_page)
            # The oldest (last) review date on this page decides whether
            # the next page can still contain unseen reviews.
            oldest_text = self.extract(response.xpath(
                "(//div[@id='block-system-main']//span[@class='date-display-single']/text())[last()]"))
            # BUG FIX: the date used to be parsed unconditionally, so an
            # empty extract (e.g. on the last page) raised ValueError.
            if oldest_text:
                oldest_review_date = datetime.strptime(oldest_text,
                                                       "%d %b %Y")
                if oldest_review_date > self.stored_last_date:
                    yield response.follow(next_page_url, callback=self.parse)
Ejemplo n.º 12
0
    def parse_category(self, response):
        """On a leaf category page, request every rated product and follow
        pagination, tagging requests with the category name."""
        category = response.meta['category']
        category_name = category['category_path']

        products = response.xpath("//div[@class='product_info']")
        # Not a leaf category page
        if not products:
            return

        for product in products:
            # Skip products without a rating widget (nothing to review).
            if not product.xpath(
                    ".//div[@class='product_rating']/span[@class!='noRating']"):
                continue

            product_href = self.extract(product.xpath(
                "./div[@class='product_name']/a/@href"))
            product_request = Request(get_full_url(response, product_href),
                                      callback=self.parse_product)
            product_request.meta['OriginalCategoryName'] = category_name
            yield product_request

        next_href = self.extract_xpath(response, "(//*[@rel='next'])[1]/@href")
        if next_href:
            page_request = Request(get_full_url(response, next_href),
                                   callback=self.parse_category)
            page_request.meta['category'] = category
            yield page_request
Ejemplo n.º 13
0
    def parse_category(self, response):
        """Crawl reviewed products on a listing page and follow pagination.

        The category comes from the page's BreadcrumbList JSON-LD, which
        lets overly-general categories be skipped early.
        """
        products = response.xpath(
            "//div[@data-component='product-list-view']/article/div[@class='desc']")
        if not products:
            return

        # This category may be too general, but it helps if we know it can
        # be skipped.
        breadcrumbs = extruct_helper.extract_json_ld(response.body,
                                                     'BreadcrumbList')
        if breadcrumbs:
            category = extruct_helper.category_item_from_breadcrumbs_json_ld(
                breadcrumbs)
            yield category
            if self.should_skip_category(category):
                return

        for product in products:
            # Only products showing a Reevoo score carry reviews.
            if not product.xpath(".//*[contains(@class, 'reevoo-score')]"):
                continue
            product_href = self.extract(product.xpath("./a/@href"))
            yield Request(url=get_full_url(response, product_href),
                          callback=self.parse_product)

        next_href = self.extract(response.xpath("//a[@class='next']/@href"))
        if next_href:
            yield Request(get_full_url(response, next_href),
                          callback=self.parse_category)
Ejemplo n.º 14
0
    def parse(self, response):
        """Yield requests for journal entries newer than the stored last
        date and paginate while the oldest entry is still unseen."""
        for review_div in response.xpath("//div[@id='content']"):
            dates = review_div.xpath(
                ".//span[@class='posted-on']/text()").getall()
            for date in dates:
                # The site writes "<date> at <time>"; dateparser copes
                # better without the separator word.
                date = str(date).replace(" at ", " ")
                review_date = dateparser.parse(date)
                if review_date and review_date > self.stored_last_date:
                    review_urls = review_div.xpath(
                        ".//h2[@class='title']/a[@class='journal-entry-navigation-current']/@href").getall()
                    for review_url in review_urls:
                        yield Request(url=get_full_url(response, review_url),
                                      callback=self.parse_items)

        next_page = self.extract(response.xpath(
            "//span[@class='paginationControlNextPageSuffix']/a/@href"))
        if next_page:
            next_page_url = get_full_url(response, next_page)
            raw_date = self.extract(response.xpath(
                "(//span[@class='posted-on']/text())[last()]"))
            oldest_review_date = dateparser.parse(
                str(raw_date).replace(" at ", " "))
            # BUG FIX: dateparser.parse may return None; comparing
            # None > datetime raised TypeError. Also hoisted the parse
            # under the next-page guard so it only runs when needed.
            if oldest_review_date and \
                    oldest_review_date > self.stored_last_date:
                yield response.follow(next_page_url, callback=self.parse)
Ejemplo n.º 15
0
    def parse_category(self, response):
        """Emit the category on first visit, request each rated product's
        page and follow pagination."""
        next_page_xpath = "(//*[@rel='next'])[1]/@href"

        category = response.meta.get('category')

        if not category:
            category = CategoryItem()
            category['category_path'] = self.extract_all(
                response.xpath('//ul[@id="breadcrumb"]//text()'), " > ")
            category['category_leaf'] = self.extract(
                response.xpath('//h1/text()'))
            category['category_url'] = response.url
            yield category

        if not self.should_skip_category(category):
            review_urls_xpath = '//div[@class="reviewsContainer"]/a[@id="ratingLink"]/@href'
            review_urls = self.extract_list(response.xpath(review_urls_xpath))
            for review_url in review_urls:
                review_url = get_full_url(response, review_url)
                # BUG FIX: str.strip('#reviewsTab') removes any of those
                # CHARACTERS from both ends, not the fragment suffix.
                if review_url.endswith('#reviewsTab'):
                    review_url = review_url[:-len('#reviewsTab')]
                request = Request(url=review_url, callback=self.parse_product)
                request.meta['category'] = category
                yield request

        next_page_url = self.extract_all(response.xpath(next_page_xpath))
        if next_page_url:
            next_page_request = Request(
                url=get_full_url(response, next_page_url),
                callback=self.parse_category)
            next_page_request.meta['category'] = category
            yield next_page_request
Ejemplo n.º 16
0
    def parse_category(self, response):
        """On pages with reviewed products, emit the category on first
        visit and request each product's review page; follow pagination."""
        reviewed_products = self.extract_list(response.xpath(
            '//div[@id="products"]//a[@class="reviews"]/@href'))

        if reviewed_products:
            category = response.meta.get('category')

            if not category:
                category = CategoryItem()
                category['category_path'] = self.extract_all(response.xpath(
                    '//ul[@id="headerCrumb"]//a/text()'), " ; ")
                category['category_leaf'] = self.extract(response.xpath(
                    '//ul[@id="headerCrumb"]/li[last()]/a/text()'))
                category['category_url'] = response.url
                yield category

            if not self.should_skip_category(category):
                for product in reviewed_products:
                    # BUG FIX: str.strip('#customer_reviews') strips any of
                    # those characters from the ends rather than removing
                    # the fragment suffix.
                    if product.endswith('#customer_reviews'):
                        product = product[:-len('#customer_reviews')]
                    request = Request(get_full_url(response, product),
                                      callback=self.parse_product)
                    request.meta['category'] = category
                    yield request

                # NOTE(review): pagination routes through self.parse, not
                # this method — looks deliberate, but worth confirming.
                next_page = self.extract(response.xpath(
                    '//a[@title="Next"]/@href'))
                if next_page:
                    request = Request(get_full_url(response, next_page),
                                      callback=self.parse)
                    request.meta['category'] = category
                    yield request
Ejemplo n.º 17
0
    def parse_product(self, response):
        """Parse a product page: yield the ProductItem, one ReviewItem per
        user review, and a follow-up request for the expert review if any.
        """
        reviews = response.xpath('//section[article[contains(@class,"review")]]')
        if reviews:
            product = ProductItem()

            product['TestUrl'] = response.url
            # Category is hard-coded for this spider.
            product['OriginalCategoryName'] = 'Cell Phones'
            product['ProductName'] = self.extract(response.xpath('//meta[@itemprop="name"]/@content'))
            pic_url = self.extract(response.xpath('//meta[@property="og:image"]/@content'))
            product['PicURL'] = get_full_url(response, pic_url)
            product['ProductManufacturer'] = self.extract(response.xpath('//meta[@itemprop="brand"]/@content'))
            yield product

            user_reviews = reviews.xpath('./article[@itemprop="review"]')

            for review in user_reviews:
                user_review = ReviewItem()
                user_review['DBaseCategoryName'] = "USER"
                user_review['ProductName'] = product['ProductName']
                user_review['TestUrl'] = product['TestUrl']
                date = self.extract(review.xpath('.//span[@class="time"]/text()'))
                user_review['TestDateText'] = date_format(date, '')
                user_review['SourceTestRating'] = self.extract(review.xpath('.//meta[@itemprop="ratingValue"]/@content'))
                user_review['Author'] = self.extract(review.xpath('.//span[@itemprop="author"]/text()'))
                # Pros/cons may span several text nodes; join with "; ".
                user_review['TestPros'] = self.extract_all(review.xpath(
                    './/div[contains(@class,"positives")]/text()'), '; ')
                user_review['TestCons'] = self.extract_all(review.xpath(
                    './/div[contains(@class,"negatives")]/text()'), '; ')
                yield user_review

            # The expert review lives on its own page; carry the product
            # item along so parse_review can link the review to it.
            pro_review_url = self.extract(reviews.xpath('./article[contains(@class,"expert")]/div/a/@href'))
            if pro_review_url:
                request = Request(url=get_full_url(response, pro_review_url), callback=self.parse_review)
                request.meta['product'] = product
                yield request
Ejemplo n.º 18
0
    def parse_sub_category(self, response):
        """Emit the category on first visit, request each reviewed product
        and follow pagination."""
        category = response.meta.get('category', None)
        if not category:
            category = CategoryItem()
            category['category_path'] = self.extract_all(
                response.xpath('//ol[@class="breadcrumb"]//span/text()'),
                " > ")
            category['category_leaf'] = self.extract(
                response.xpath('//h1/text()'))
            category['category_url'] = response.url
            yield category

        if self.should_skip_category(category):
            return

        offers = response.xpath(
            "//ul[@id='product-offer-list']/li[contains(@class, 'list-item')]")
        for offer in offers:
            # Only follow products that actually show a rating link.
            if not offer.xpath('.//div[@class="rating-in-words"]/a/@href'):
                continue

            offer_href = self.extract(
                offer.xpath(".//h4[@class='item-name']/a/@href"))
            if offer_href:
                product_request = Request(
                    url=get_full_url(response, offer_href),
                    callback=self.parse_product)
                product_request.meta['category'] = category
                yield product_request

        next_href = self.extract(
            response.xpath('//a[@class="next_page"]/@href'))
        if next_href:
            page_request = Request(url=get_full_url(response, next_href),
                                   callback=self.parse_sub_category)
            page_request.meta['category'] = category
            yield page_request
Ejemplo n.º 19
0
 def level_3(self, response):
     """Follow listing pagination (re-entering level_3) and dispatch each
     item page to level_4.

     NOTE(review): the always-false ``if "":`` regex branches from the
     spider template could never run and were removed.
     """
     original_url = response.url

     # Pagination: keep walking the listing through this same callback.
     next_url = self.extract(response.xpath("//a[@class='gspr next']/@href"))
     if next_url:
         next_url = get_full_url(original_url, next_url)
         yield Request(next_url, callback=self.level_3)

     # Individual item pages go to the next crawl level.
     item_urls = self.extract_list(response.xpath(
         "//div[@class='mimg itmcd img']//a[@class='vip']/@href"))
     for item_url in item_urls:
         item_url = get_full_url(original_url, item_url)
         yield Request(item_url, callback=self.level_4)
Ejemplo n.º 20
0
    def parse_category(self, response):
        """Recurse into sub-categories; on leaf pages request products and
        follow pagination."""
        sub_cat_urls = self.extract_list(response.xpath(
            "//div[@id='subCategorycategories']//a/@href"))

        if sub_cat_urls:
            # Non-leaf page: fan out into each sub-category.
            for sub_cat_url in sub_cat_urls:
                yield Request(get_full_url(response, sub_cat_url),
                              callback=self.parse_category)
            return

        # Leaf page: products plus pagination.
        for product_url in self.extract_list(response.xpath(
                "//div[@class='description']/a/@href")):
            yield Request(get_full_url(response, product_url),
                          callback=self.parse_product)

        next_page_url = self.extract(
            response.xpath("//*[@rel='next']/@href"))
        if next_page_url:
            yield Request(get_full_url(response, next_page_url),
                          callback=self.parse_category)
Ejemplo n.º 21
0
    def parse_category(self, response):
        """Request every review on the page; paginate only while the
        newest review is not older than the stored last date."""
        review_urls = self.extract_list(response.xpath(
            '//div[@id="content"]//p[@class="title"]/span/a/@href'))
        for review_url in review_urls:
            yield Request(get_full_url(response, review_url),
                          callback=self.parse_review)

        # Incremental scraping: stop once we reach reviews already
        # collected in a previous run.
        latest_text = self.extract_xpath(
            response,
            '//div[contains(@class, "field-name-field-published-date")]//text()')
        latest_date = dateparser.parse(latest_text)
        if latest_date and latest_date < self.stored_last_date:
            return

        next_page = self.extract(response.xpath(
            '//a[@title="Go to next page"]/@href'))
        if next_page:
            yield Request(get_full_url(response, next_page),
                          callback=self.parse_category)
Ejemplo n.º 22
0
    def parse(self, response):
        """Emit the category on first visit and request every rated
        product, following pagination with the category in meta."""
        category = response.meta.get('category', '')
        if not category:
            category = CategoryItem()
            category['category_url'] = response.url
            category['category_leaf'] = self.extract(
                response.xpath('(//h1[1])/text()'))
            category['category_path'] = category['category_leaf']
            yield category

        for product in response.xpath("//ul[@class='products']/li"):
            # Only products with an average-rating widget have reviews.
            rated = self.extract(product.xpath(
                ".//ul[contains(@title, 'Average Rating')]"))
            if rated:
                product_href = self.extract(product.xpath("./a/@href"))
                product_request = Request(
                    get_full_url(response, product_href),
                    callback=self.parse_product)
                product_request.meta['category'] = category
                yield product_request

        next_href = self.extract(
            response.xpath("(//*[@rel='next'])[1]/@href"))
        if next_href:
            page_request = Request(get_full_url(response, next_href),
                                   callback=self.parse)
            page_request.meta['category'] = category
            yield page_request
Ejemplo n.º 23
0
    def parse_category(self, response):
        """Emit the category on first visit and request every product that
        shows rating points; follow pagination."""
        category = response.meta.get('category')

        if not category:
            category = CategoryItem()
            category['category_path'] = self.extract_all(response.xpath(
                '//div[contains(@class,"localizer")]/*/text()'))
            category['category_leaf'] = self.extract(response.xpath(
                '//div[contains(@class,"localizer")]/span[last()]/text()'))
            category['category_url'] = response.url
            yield category

        if self.should_skip_category(category):
            return

        product_urls = self.extract_list(response.xpath(
            '//div[contains(@class,"ckPoints")]/ancestor::div[@class="infoWrapper"]'
            '//p[@class="title"]/a/@href'))

        for product_url in product_urls:
            # Suffix presumably selects a sort order (date added,
            # descending) on the product page — verify against the site.
            full_url = (get_full_url(response, product_url)
                        + '/1/data_dodania/malejaco')
            product_request = Request(url=full_url,
                                      callback=self.parse_product)
            product_request.meta['category'] = category
            yield product_request

        next_page_urls = self.extract_list(response.xpath(
            '//a[contains(@class,"right")]/@href'))
        if next_page_urls:
            page_request = Request(
                url=get_full_url(response, next_page_urls[0]),
                callback=self.parse_category)
            page_request.meta['category'] = category
            yield page_request
Ejemplo n.º 24
0
    def parse(self, response):
        """Follow the single 'vor' (next) pagination link recursively and
        dispatch each listing entry to level_2.

        Cleanup over the generated original: the `if "":` regex-filter
        blocks could never execute (an empty string is falsy), so
        `matches` was always None — that dead template code is removed.
        """
        original_url = response.url

        url_xpath = "//a[img[@alt='vor']]/@href"
        single_url = self.extract(response.xpath(url_xpath))
        if single_url:
            single_url = get_full_url(original_url, single_url)
            yield Request(single_url, callback=self.parse)

        urls_xpath = "//li[contains(@class, 'ttboxpad')]//a[contains(@class, 'extra')]/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            single_url = get_full_url(original_url, single_url)
            yield Request(single_url, callback=self.level_2)
Ejemplo n.º 25
0
    def level_2(self, response):
        """Build a product item and a PRO review item from an article page.

        Fixes over the generated original:
        - removed a try/except that read an undefined `category` variable
          (it always raised NameError, silently swallowed),
        - removed a dead branch that compared the constant XPath string
          "//meta[...]" to "//" (always False),
        - narrowed bare excepts to KeyError.
        """
        original_url = response.url

        product_xpaths = {
            "source_internal_id":
            "substring-before(substring-after(//body/@class,'postid-'),' ')",
            "ProductName": "//h1//text()",
            "OriginalCategoryName":
            "//meta[@property='article:section']/@content",
            "PicURL": "//meta[@property='og:image']/@content",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['TestUrl'] = original_url

        # Normalize protocol-relative and root-relative picture URLs.
        # NOTE: both ifs test the ORIGINAL extracted value, matching the
        # original code (a "//" URL falls through to get_full_url).
        picurl = product.get("PicURL", "")
        if picurl and picurl[:2] == "//":
            product["PicURL"] = "https:" + picurl
        if picurl and picurl[:1] == "/":
            product["PicURL"] = get_full_url(original_url, picurl)

        # Template artifact kept for output parity: make sure the field
        # exists (as an empty string) when no manufacturer was extracted.
        if product.get("ProductManufacturer", "") == "":
            product["ProductManufacturer"] = ""

        review_xpaths = {
            "source_internal_id":
            "substring-before(substring-after(//body/@class,'postid-'),' ')",
            "ProductName": "//h1//text()",
            "TestDateText":
            "substring-before(//meta[contains(@property,'published_time')]/@content,'T')",
            "TestSummary": "//meta[@property='og:description']/@content",
            "Author": "//div[@class='single-info']/a//text()",
            "TestTitle": "//h1//text()",
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        review['TestUrl'] = original_url
        try:
            review['ProductName'] = product['ProductName']
            review['source_internal_id'] = product['source_internal_id']
        except KeyError:
            # Best effort: product may legitimately lack these fields.
            pass

        # Same URL normalization for the award picture, if any.
        awpic_link = review.get("AwardPic", "")
        if awpic_link and awpic_link[:2] == "//":
            review["AwardPic"] = "https:" + awpic_link
        if awpic_link and awpic_link[:1] == "/":
            review["AwardPic"] = get_full_url(original_url, awpic_link)

        review["DBaseCategoryName"] = "PRO"

        yield product

        yield review
Ejemplo n.º 26
0
    def parse(self, response):
        """Follow pagination and dispatch listing entries to level_2,
        carrying the product/review meta along on every request.

        Fixes over the generated original: removed leftover Python 2
        debug `print` statements, the dead `if "":` regex filter (empty
        string is falsy, so it never ran), the unused `params_regex`
        dict, and the no-op try/excepts around plain dict assignments.
        """
        original_url = response.url
        product = response.meta.get("product", {})
        review = response.meta.get("review", {})

        url_xpath = "(//p[@class='pager']/span[@class='pages']/following::a)[1]/@href"
        single_url = self.extract(response.xpath(url_xpath))
        # The href can contain literal whitespace; strip it before testing.
        single_url = single_url.replace('\n', '').replace('\t', '')
        if single_url:
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.parse)
            request.meta["product"] = product
            request.meta["review"] = review
            yield request

        urls_xpath = "//ul[@class='reset']/li/div[@class='spc']/h2/a/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_2)
            request.meta["product"] = product
            request.meta["review"] = review
            yield request
Ejemplo n.º 27
0
    def level_2(self, response):
        """Build a product item and a PRO review item from a game review page.

        Fixes over the generated original:
        - removed a try/except that read an undefined `category` variable
          (it always raised NameError, silently swallowed),
        - narrowed bare excepts to KeyError.
        The live '"game"[:2] != "//"' branch is kept (simplified): it
        defaults OriginalCategoryName to the literal "game".
        """
        original_url = response.url

        product_xpaths = {
            "ProductName": "//h1[@itemprop='name']/text()",
            "OriginalCategoryName": "game",
            "PicURL": "//meta[@property='og:image']/@content",
        }
        product = self.init_item_by_xpaths(response, "product", product_xpaths)
        product['TestUrl'] = original_url

        # Normalize protocol-relative and root-relative picture URLs.
        # Both ifs test the ORIGINAL extracted value, as in the original code.
        picurl = product.get("PicURL", "")
        if picurl and picurl[:2] == "//":
            product["PicURL"] = "https:" + picurl
        if picurl and picurl[:1] == "/":
            product["PicURL"] = get_full_url(original_url, picurl)

        # Template artifact kept for output parity: ensure the field exists.
        if product.get("ProductManufacturer", "") == "":
            product["ProductManufacturer"] = ""

        # "game" is a constant, not an XPath, so init_item_by_xpaths leaves
        # the field empty; default it to the literal category name.
        if product.get("OriginalCategoryName", "") == "":
            product["OriginalCategoryName"] = "game"

        review_xpaths = {
            "ProductName": "//h1[@itemprop='name']/text()",
            "TestPros":
            "//div[h2[contains(text(),'Pros')]]/following-sibling::div//li/text()",
            "TestCons":
            "//div[h2[contains(text(),'Cons')]]/following-sibling::div//li/text()",
            "TestSummary":
            "//div[@class='post_content']/p[contains(.,' ')][not(img)][1]//text()",
            "Author": "//a[@itemprop='author']/text()",
            "TestTitle": "//h1[@itemprop='name']/text()",
        }
        review = self.init_item_by_xpaths(response, "review", review_xpaths)
        review['TestUrl'] = original_url
        try:
            review['ProductName'] = product['ProductName']
            review['source_internal_id'] = product['source_internal_id']
        except KeyError:
            # Best effort: product may legitimately lack these fields.
            pass

        # Same URL normalization for the award picture, if any.
        awpic_link = review.get("AwardPic", "")
        if awpic_link and awpic_link[:2] == "//":
            review["AwardPic"] = "https:" + awpic_link
        if awpic_link and awpic_link[:1] == "/":
            review["AwardPic"] = get_full_url(original_url, awpic_link)

        review["DBaseCategoryName"] = "PRO"

        yield product

        yield review
Ejemplo n.º 28
0
    def parse_category(self, response):
        """Walk a category tree.

        Pages with a Department block fan out into sub-categories; leaf
        listing pages emit the CategoryItem (first visit), then requests
        for rated products and for the next page.
        """
        if not response.xpath("//*[@class='productList']"):
            # Not a category listing page at all.
            return

        if response.xpath("//*[@class='Department']"):
            # Department page: recurse into each visible sub-category.
            subcat_xpath = ("(//*[@class='Department']/following::ul[1])"
                            "/li[not(contains(@class, 'hidden'))]/a/@href")
            for subcat_url in self.extract_list(response.xpath(subcat_xpath)):
                yield Request(get_full_url(response, subcat_url),
                              callback=self.parse_category)
            return

        category = response.meta.get('category', None)
        if not category:
            path_xpath = ("//ul[@class='breadcrumbList']"
                          "/li[position() < last()]/a//text()")
            leaf_xpath = "//ul[@class='breadcrumbList']/li[last()]/a//text()"
            category = CategoryItem()
            category['category_url'] = response.url
            category['category_leaf'] = self.extract(
                response.xpath(leaf_xpath))
            category['category_path'] = self.extract_all(
                response.xpath(path_xpath), separator=' | ')
            category['category_path'] = '%s | %s' % (
                category['category_path'], category['category_leaf'])
            yield category

        if self.should_skip_category(category):
            return

        for product in response.xpath(
                "//*[@class='productList']/li/div[@class='productInfo']"):
            # Only follow products that show the rating roll-up widget.
            if self.extract(product.xpath("./div[@class='bvRollup']")):
                product_url = self.extract(
                    product.xpath("./a[@class='productTitle'][1]/@href"))
                request = Request(get_full_url(response, product_url),
                                  callback=self.parse_product)
                request.meta['category'] = category
                yield request

        next_page_url = self.extract(
            response.xpath("(//*[@rel='next'])[1]/@href"))
        if next_page_url:
            request = Request(get_full_url(response, next_page_url),
                              callback=self.parse_category)
            request.meta['category'] = category
            yield request
Ejemplo n.º 29
0
    def parse(self, response):
        """Follow the blog's next-page link and dispatch review articles
        to level_2, carrying the product/review meta along.

        Bug fix: the original prepended '/bloggen' to the extracted href
        BEFORE testing it, so when no next-link existed the value was the
        truthy string '/bloggen' and the spider kept re-requesting that
        bogus URL. The emptiness check now comes first. Also removed the
        dead `if "":` template code, the unused `params_regex` dict, and
        the no-op try/excepts around plain dict assignments.
        """
        original_url = response.url
        product = response.meta.get("product", {})
        review = response.meta.get("review", {})

        url_xpath = "//span[contains(.,'ta sida')]/../@href"
        single_url = self.extract(response.xpath(url_xpath))
        if single_url:
            # The pagination href is site-relative under /bloggen.
            single_url = get_full_url(original_url, '/bloggen' + single_url)
            request = Request(single_url, callback=self.parse)
            request.meta["product"] = product
            request.meta["review"] = review
            yield request

        urls_xpath = "//div[@class='blogg_big_container'][contains(.,'Test')]//a[contains(@href,'article')]/@href"
        for single_url in self.extract_list(response.xpath(urls_xpath)):
            single_url = get_full_url(original_url, single_url)
            request = Request(single_url, callback=self.level_2)
            request.meta["product"] = product
            request.meta["review"] = review
            yield request
Ejemplo n.º 30
0
 def parse(self, response):
                                  
     original_url = response.url
     product = response.meta.get("product", {})
     review = response.meta.get("review", {})
     
     url_xpath = u"//div[@class='navigation']/div[contains(@class,'left')]//a/@href"
     single_url = self.extract(response.xpath(url_xpath))
     if single_url:
         matches = None
         if "":
             matches = re.search("", single_url, re.IGNORECASE)
             if matches:
                 single_url = matches.group(0)
             else:
                 return
         single_url = get_full_url(original_url, single_url)
         
         request = Request(single_url, callback=self.parse)
         try:
             request.meta["product"] = product
         except:
             pass
         try:
             request.meta["review"] = review
         except:
             pass
         yield request
     urls_xpath = u"//section//article//a/@href"
     params_regex = {}
     urls = self.extract_list(response.xpath(urls_xpath))
     
     for single_url in urls:
         matches = None
         if "":
             matches = re.search("", single_url, re.IGNORECASE)
             if matches:
                 single_url = matches.group(0)
             else:
                 continue
         single_url = get_full_url(original_url, single_url)
         
         request = Request(single_url, callback=self.level_2)
         
          
         try:
             request.meta["product"] = product
         except:
             pass
         try:
             request.meta["review"] = review
         except:
             pass
         yield request