def parse_profile(self, response):
    """Fill the item carried in ``response.meta`` from a profile page.

    The e-mail is taken from the first text node containing "@" that sits
    in a ``<span>`` under a ``mailto`` link. When the page has no such
    node nothing else is scraped and ``None`` is returned.

    Otherwise the name (from the ``profile-info`` block), the e-mail, the
    page URL (stored under ``reviewer``) and ``self.COUNTRY`` (stored
    under ``country``) are handed to the ``cond_set`` /
    ``cond_set_value`` helpers and the item is returned.

    :param response: page response; must expose ``meta``, ``xpath`` and
        ``url``.
    :returns: the object stored under ``response.meta['item']``, or
        ``None`` when no e-mail address was found.
    """
    item = response.meta.get('item')

    addresses = response.xpath(
        '//a[contains(@href,"mailto")]/span/text()[contains(., "@")]'
    ).extract()
    if not addresses:
        # A profile without an address produces no item at all.
        return None
    email = addresses[0].strip()

    names = response.xpath(
        "//div[@class='profile-info']/div/div/span/text()"
    ).extract()
    # NOTE(review): cond_set / cond_set_value are defined outside this
    # block; presumably they only fill keys that are still unset --
    # confirm against their definitions.
    # NOTE(review): ``string.strip`` looks like the Python 2 ``string``
    # module function -- confirm before porting to Python 3.
    cond_set(item, 'name', names, string.strip)

    cond_set_value(item, 'email', email)
    cond_set_value(item, 'reviewer', response.url)
    cond_set_value(item, 'country', self.COUNTRY)
    return item
def parse_profile(self, response):
    """Fill the item carried in ``response.meta`` from a profile page.

    Candidate e-mail texts are read from ``<span>`` elements under
    ``mailto`` links. Only candidates that contain "@" are considered, so
    a label or a whitespace-only text node inside the link is no longer
    stored as the address. When no candidate qualifies, nothing else is
    scraped and ``None`` is returned.

    Otherwise the name (from the ``profile-info`` block), the e-mail, the
    page URL (stored under ``reviewer``) and ``self.COUNTRY`` (stored
    under ``country``) are handed to the ``cond_set`` /
    ``cond_set_value`` helpers and the item is returned.

    :param response: page response; must expose ``meta``, ``xpath`` and
        ``url``.
    :returns: the object stored under ``response.meta['item']``, or
        ``None`` when no e-mail address was found.
    """
    item = response.meta.get('item')

    link_texts = response.xpath(
        '//a[contains(@href,"mailto")]/span/text()'
    ).extract()
    # Anything without "@" cannot be an address; skipping it here avoids
    # returning an item whose e-mail is a caption or an empty string.
    emails = [text.strip() for text in link_texts if '@' in text]
    if not emails:
        return None
    email = emails[0]

    names = response.xpath(
        "//div[@class='profile-info']/div/div/span/text()"
    ).extract()
    # NOTE(review): cond_set / cond_set_value are defined outside this
    # block; presumably they only fill keys that are still unset --
    # confirm against their definitions.
    cond_set(item, 'name', names, string.strip)

    cond_set_value(item, 'email', email)
    cond_set_value(item, 'reviewer', response.url)
    cond_set_value(item, 'country', self.COUNTRY)
    return item
def _scrape_product_links(self, response): products = response.xpath('//li[@class="s-result-item"]') for pr in products: if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")] |' './/h5[contains(text(), "Sponsored")]'): continue product = ProductItem() cond_set(product, 'title', pr.xpath('.//h2/../@title').extract()) cond_set(product, 'product_image', pr.xpath('.//img[@alt="Product Details"]/@src').extract()) cond_set( product, 'brand', pr.xpath('.//div[@class="a-fixed-left-grid-col a-col-right"]' '/div/div/span[2]/text() |' './/div[@class="a-row a-spacing-mini"]/span[2]/text()' ).extract()) cond_set( product, 'price', pr.xpath( './/span[contains(@class,"s-price")]/text()').extract()) cond_set(product, 'asin', pr.xpath('@data-asin').extract()) if pr.xpath('.//i[contains(@class, "a-icon-prime")]'): cond_set_value(product, 'prime', True) else: cond_set_value(product, 'prime', False) cond_set( product, 'shipping_price', pr.xpath( './/span[contains(@class,"s-price")]/' 'following::span[2]/text()').re('(\d+.?\d+) shipping')) new = pr.xpath('.//a[contains(text(),"new")]/span/text()') if new: cond_set(product, 'new_price', new.extract()) cond_set(product, 'new_offers', new[1].re('\d+')) used = pr.xpath('.//a[contains(text(),"used")]/span/text()') if used: cond_set(product, 'used_price', used.extract()) cond_set(product, 'used_offers', used[1].re('\d+')) cond_set( product, 'rating', pr.xpath('.//span[contains(@name,"' + product['asin'] + '")]/span/a/i/span').re('(\d+.?\d+)')) cond_set( product, 'number_of_reviews', pr.xpath('.//span[contains(@name,"' + product['asin'] + '")]/' 'following::a[1]/text()').re('([\d+,?]+\d+)')) category = pr.xpath( './/span[contains(@class,"a-text-bold")]/text()').re('(.*):') if not category: category = response.xpath( '//div[@id="autoscoping-backlink"]/div/span/span/text()' ).extract() cond_set(product, 'category', category) number_of_items = pr.xpath( './/span[contains(@class,"a-text-bold")]/../text()').re( '([\d+,?]+\d+)') if 
number_of_items: cond_set_value(product, 'number_of_items', number_of_items[0]) else: cond_set_value(product, 'number_of_items', response.meta.get('total_matches')) product['all_brands'] = response.xpath( '//h2[text()="Brand"]/following::ul[1]/' 'li[@class="refinementImage"]/a/span/text()').extract() yield product
def _scrape_product_links(self, response): products = response.xpath('//li[@class="s-result-item"]') for pr in products: if pr.xpath('.//h5[contains(@class, "s-sponsored-list-header")]'): continue product = ProductItem() cond_set(product, 'title', pr.xpath('.//h2/../@title').extract()) cond_set(product, 'product_image', pr.xpath('.//img[@alt="Product Details"]/@src').extract()) cond_set(product, 'brand', pr.xpath( './/div[@class="a-fixed-left-grid-col a-col-right"]' '/div/div/span[2]/text()').extract()) cond_set(product, 'price', pr.xpath( './/span[contains(@class,"s-price")]/text()' ).extract()) cond_set(product, 'asin', pr.xpath('@data-asin').extract()) if pr.xpath('.//i[contains(@class, "a-icon-prime")]'): cond_set_value(product, 'prime', True) else: cond_set_value(product, 'prime', False) cond_set(product, 'shipping_price', pr.xpath( './/span[contains(@class,"s-price")]/' 'following::span[2]/text()').re('(\d+.?\d+) shipping')) new = pr.xpath('.//a[contains(text(),"new")]/span/text()') if new: cond_set(product, 'new_price', new.extract()) cond_set(product, 'new_offers', new[1].re('\d+')) used = pr.xpath('.//a[contains(text(),"used")]/span/text()') if used: cond_set(product, 'used_price', used.extract()) cond_set(product, 'used_offers', used[1].re('\d+')) cond_set(product, 'rating', pr.xpath( './/span[contains(@name,"'+product['asin']+'")]/span/a/i/span' ).re('(\d+.?\d+)')) cond_set(product, 'number_of_reviews', pr.xpath( './/span[contains(@name,"'+product['asin']+'")]/' 'following::a[1]/text()').re('([\d+,?]+\d+)')) cond_set(product, 'category', pr.xpath( './/span[contains(@class,"a-text-bold")]/text()' ).re('(.*):')) number_of_items = pr.xpath( './/span[contains(@class,"a-text-bold")]/../text()' ).re('([\d+,?]+\d+)') if number_of_items: cond_set_value(product, 'number_of_items', number_of_items[0]) # product['url'] = pr.xpath('.//h2/../@href')[0].extract() # cond_set(product, 'url', pr.xpath('.//h2/../@href').extract()) yield product