def parse(self, response):
    """Parse one best-seller listing page.

    Yields an AmazonItem per ``zg_itemRow`` row, then follows the
    "next page" link (the sibling of the selected pager entry).
    """
    for row in response.xpath('//div[@class="zg_itemRow"]'):
        item = AmazonItem()
        item['rank_number'] = row.xpath(
            './/span[@class="zg_rankNumber"]/text()').extract_first()
        # Index [2]: the original selector takes the third matching anchor.
        item['book_name'] = row.xpath(
            './/a[@class="a-link-normal"]/text()')[2].extract()
        item['author'] = row.xpath(
            './/span[@class="a-size-small a-color-base"]/text()'
        ).extract_first()
        # FIX: was '///span[...]' — a triple slash is not valid XPath and
        # raises a ValueError; use the same row-relative './/' form as the
        # other fields.
        item['star_rank'] = row.xpath(
            './/span[@class="a-icon-alt"]/text()').extract_first()
        item['book_type'] = row.xpath(
            './/span[@class="a-size-small a-color-secondary"]/text()'
        ).extract_first()
        item['price'] = row.xpath(
            './/span[@class="p13n-sc-price"]/text()').extract_first()
        yield item
    xpath_next_page = './/li[@class="zg_page zg_selected"]/following-sibling::li/a/@href'
    if response.xpath(xpath_next_page):
        url_next_page = response.xpath(xpath_next_page).extract_first()
        yield scrapy.Request(url_next_page, callback=self.parse)
def parse_detail(self, response):
    """Emit one AmazonItem per search-result <li>.

    The shared 'type' field comes from the page-level bold breadcrumb span;
    all other fields are extracted relative to each result row.
    """
    # Page-level category; identical for every row on this response.
    category = response.xpath(
        '//span[@class="a-color-state a-text-bold"]/text()'
    ).extract_first()
    row_fields = (
        ("img_url", './/img/@src'),
        ("product_name", './/h2/text()'),
        ("product_url", './/div[@class = "a-row a-spacing-mini"]//a/@href'),
        ("product_price", './/a[@class="a-link-normal a-text-normal"]/span/text()'),
        ("product_score",
         './/a[@class="a-popover-trigger a-declarative"]//span[@class="a-icon-alt"]/text()'),
        ("product_freight",
         './/a[@class="a-link-normal a-text-normal"]/following-sibling::span[@class="a-size-small a-color-secondary"]/text()'),
    )
    for result in response.xpath("//ul[@id = 's-results-list-atf']/li"):
        item = AmazonItem()
        item["type"] = category
        for field, xp in row_fields:
            item[field] = result.xpath(xp).extract_first()
        yield item
def parse(self, response):
    """Collect (name, detail URL, image) triples and download each image.

    Images are saved under ./crawlImages/ using the spider-wide
    ``amazonSpider.imgcount`` counter as the filename.
    """
    namelist = response.xpath(
        '//a[@class="a-link-normal s-access-detail-page s-overflow-ellipsis a-text-normal"]/@title'
    ).extract()
    htmllist = response.xpath(
        '//a[@class="a-link-normal s-access-detail-page s-overflow-ellipsis a-text-normal"]/@href'
    ).extract()
    imglist = response.xpath(
        '//img[@class="s-access-image cfMarker"]/@src').extract()
    pwd = os.getcwd() + '/'
    if not os.path.isdir(pwd + 'crawlImages/'):
        os.mkdir(pwd + 'crawlImages/')
    # Index by the name list, mirroring the original loop bounds.
    for idx, product_name in enumerate(namelist):
        item = AmazonItem()
        item['Name'] = product_name
        item['Source'] = htmllist[idx]
        local_path = pwd + "crawlImages/" + str(amazonSpider.imgcount) + ".jpg"
        # Python 2 API: urllib.urlretrieve downloads the image to disk.
        urllib.urlretrieve(imglist[idx], local_path)
        item['Path'] = local_path
        amazonSpider.imgcount = amazonSpider.imgcount + 1
        yield item
def parse(self, response):
    """Extract product titles and sale prices from a search-results page.

    FIX: the price selector ended in '/@text', which reads a (nonexistent)
    attribute named "text" and therefore always returned nothing; 'text()'
    selects the element's text node.
    """
    items = AmazonItem()
    title = response.xpath(
        '//a[@class ="a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal"]/@title'
    ).extract()
    sale_price = response.xpath(
        '//span[@class ="a-size-base a-color-price s-price a-text-bold"]/text()'
    ).extract()
    items['product_name'] = ''.join(title).strip()
    items['product_sale_price'] = ''.join(sale_price).strip()
    yield items
def parse_item(self, response):
    """Populate one AmazonItem from a product page.

    Raises CloseSpider once more than MAX_CANT_TO_SEARCH items have been
    scraped, stopping the crawl.
    """
    field_paths = (
        ('ASIN',
         'normalize-space(//th[contains(.,"ASIN")]//following-sibling::td/text())'),
        ('Title',
         'normalize-space(//span[contains(@id,"productTitle")]/text())'),
        ('Description',
         'normalize-space(//div[contains(@id,"productDescription")]/div/p[1]/text())'),
        ('Price',
         'normalize-space(//span[contains(@id,"priceblock_ourprice")]/text())'),
        ('List_price',
         'normalize-space(//span[contains(@class,"a-text-strike")]/text())'),
        ('Image_URL',
         'normalize-space(//div[contains(@id,"imgTagWrapperId")]/img/@data-old-hires)'),
    )
    ml_item = AmazonItem()
    for field, path in field_paths:
        ml_item[field] = response.xpath(path).extract()
    self.count_item_scrapp += 1
    if self.count_item_scrapp > MAX_CANT_TO_SEARCH:
        raise CloseSpider('item_exceeded')
    yield ml_item
def parse(self, response):
    """Yield name/url for every search result, then follow pagination.

    Debug prints from the original are kept (they are observable output).
    """
    products = response.css('li[id^="result"]')
    for prod in products:
        detail_href = prod.css(
            'a.a-link-normal.s-access-detail-page.s-color-twister-title-link.a-text-normal::attr(href)'
        ).extract_first()
        item = AmazonItem()
        item['name'] = prod.css('h2::attr(data-attribute)').extract_first()
        item['url'] = response.urljoin(detail_href)
        print(item)
        yield item
    next_page = response.css('span.pagnRA a::attr(href)').extract_first()
    if next_page is not None:
        yield response.follow(next_page, callback=self.parse)
    print(products)
def parse_page_product(self, response):
    # Re-check the tracked product identified by meta['asin']: find its
    # current price in the raw response body, e-mail the user on a price
    # drop, and persist the updated price map to self.fileObject once every
    # tracked product has been re-scraped. Python 2 code (print statements).
    print response.meta['asin']
    asin = response.meta['asin']
    self.scraped_product_count += 1
    item = AmazonItem()
    # A single "View Offer" link means this page is a lightning-deal
    # wrapper: follow it to the real offer page instead of parsing here.
    lightning_deal_url = response.xpath(
        "//a[@title='View Offer']/@href").extract()
    if len(lightning_deal_url) == 1:
        print lightning_deal_url[0]
        # Undo the count bump — this request did not yield a product.
        self.scraped_product_count -= 1
        lightning_deal_url = "https://www." + self.allowed_domains[
            0] + lightning_deal_url[0]
        yield Request(lightning_deal_url,
                      callback=self.parse_page_product,
                      meta={'asin': asin})
    else:
        # Locate the price label in the raw HTML, trying the most specific
        # marker first: "Deal Price:" -> "Sale:" -> "Price:".
        new_response_body = response.body.partition("Deal Price:")
        if len(new_response_body[1]) == 0:
            new_response_body = response.body.partition("Sale:")
        if len(new_response_body[1]) == 0:
            new_response_body = response.body.partition("Price:")
        if len(new_response_body[1]) != 0:
            # Text after the label; strip thousands separators before the
            # regex pulls the first numeric token between '>' and '<'.
            price_text = new_response_body[2].replace(",", "")
            priceRegex = re.compile(r"(?<=\>)\s*\d+\.\d+|\d+(?=\<)")
            price_match = priceRegex.search(price_text)
            if price_match:
                price = price_match.group(0)
                product_price = price.replace(" ", "")
                item["name"] = self.productDataDict[asin]["name"]
                item["price"] = product_price
                old_price = self.productDataDict[asin]["price"]
                # Empty stored price means first observation — no comparison.
                if len(old_price) > 0:
                    new_price = round(float(product_price), 2)
                    old_price = round(float(old_price), 2)
                    if new_price < old_price:
                        self.send_mail(
                            old_price, new_price, asin,
                            self.productDataDict[asin]["user_mail_id"],
                            self.productDataDict[asin]["name"])
                self.productDataDict[asin]["price"] = product_price
                # Once all products are refreshed, rewrite the JSON store
                # in place (seek+truncate) and close it.
                if self.scraped_product_count == len(self.productDataDict):
                    self.fileObject.seek(0)
                    self.fileObject.truncate()
                    json.dump(self.productDataDict, self.fileObject)
                    self.fileObject.close()
                yield item
        else:
            # No price marker found: blank the stored url and still flush
            # the store when this was the last pending product.
            self.productDataDict[asin]["url"] = ""
            if self.scraped_product_count == len(self.productDataDict):
                self.fileObject.seek(0)
                self.fileObject.truncate()
                json.dump(self.productDataDict, self.fileObject)
                self.fileObject.close()
            print 'Failed to get the price of the product'
def parse_products(self, response):
    """Build one item from a camera/audio-accessory product page.

    FIX: ``.extract()[0]`` on the star rating raised IndexError on pages
    with no rating anchor; fall back to None so the rest of the item is
    still returned.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    item = AmazonItem()
    item['title'] = hxs.select(
        '//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()'
    ).extract()
    item['brand'] = hxs.select('//a[@id="brand"]/text()').extract()
    item['specs'] = hxs.select('//div[@class="pdTab"][1]//node()').extract()
    item['offerprice'] = hxs.select(
        '//span[@id="priceblock_ourprice"]/text()').extract()
    item['saleprice'] = hxs.select(
        '//span[@id="priceblock_saleprice"]/text()').extract()
    item['description'] = hxs.select(
        '//div[@id="productDescription"]//text()').extract()
    item['feature'] = hxs.select(
        '//ul[@class="a-vertical a-spacing-none"]/li/span/text()').extract()
    item['image'] = hxs.select(
        '//span[@class="a-button-text"]/img/@src').extract()
    item['link'] = response.meta["url"]
    item['seller'] = hxs.select(
        '//div[@id="merchant-info"]/a[1]/text()').extract()
    item['sellrating'] = hxs.select(
        '//div[@id="merchant-info"]/text()').extract()
    stars = hxs.select('//a[@class="a-link-normal"]/i/span/text()').extract()
    item['starating'] = stars[0] if stars else None
    # Fixed values for this category-specific spider.
    item['COD'] = "Available"
    item['category'] = "Cameras, Audio & Video"
    item['subcategory'] = "Home Audio & Video Accessories"
    items.append(item)
    return items
def parse_products(self, response):
    """Build one item from a Tamil-books product page.

    FIX: ``.extract()[0]`` on the star rating raised IndexError on pages
    with no rating anchor; fall back to None so the rest of the item is
    still returned.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    item = AmazonItem()
    item['title'] = hxs.select(
        '//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()'
    ).extract()
    item['brand'] = hxs.select('//a[@id="brand"]/text()').extract()
    item['specs'] = hxs.select('//div[@class="pdTab"][1]//node()').extract()
    item['offerprice'] = hxs.select(
        '//span[@class="a-size-medium a-color-price inlineBlock-display offer-price a-text-normal price3P"]/text()'
    ).extract()
    item['saleprice'] = hxs.select(
        '//span[@id="priceblock_saleprice"]/text()').extract()
    item['description'] = hxs.select(
        '//div[@id="productDescription"]//text()').extract()
    item['feature'] = hxs.select(
        '//ul[@class="a-vertical a-spacing-none"]/li/span/text()').extract()
    item['image'] = hxs.select('//div[@id="img-canvas"]/img/@src').extract()
    item['link'] = response.meta["url"]
    item['seller'] = hxs.select(
        '//div[@id="merchant-info"]/a[1]/text()').extract()
    item['sellrating'] = hxs.select(
        '//div[@id="merchant-info"]/text()').extract()
    stars = hxs.select('//a[@class="a-link-normal"]/i/span/text()').extract()
    item['starating'] = stars[0] if stars else None
    # Fixed values for this category-specific spider.
    item['COD'] = "Available"
    item['category'] = "Books"
    item['subcategory'] = "Tamil"
    items.append(item)
    return items
def parse(self, response):
    """Yield book name/author/price rows, then walk listing pages 2..10.

    Note: ``.get()`` misses become the literal string "None" via str(),
    which is what the filter below tests for (original behavior kept).
    """
    item = AmazonItem()
    for card in response.css(
            '.s-include-content-margin .sg-row , .a-color-base.a-text-normal'):
        item['name'] = str(
            card.css('span.a-color-base.a-text-normal::text').get())
        raw_author = str(
            card.css('.a-color-secondary .a-size-base.a-link-normal::text').get())
        item['author'] = raw_author.strip().split('\n')[0]
        item['price'] = str(
            card.css('.a-spacing-top-small .a-price-whole::text').get())
        complete = (item['name'] != "None"
                    and item['author'] != "None"
                    and item['price'] != "None")
        if complete:
            yield item
            print("---" * 50)
    next_page = ('https://www.amazon.com/s?i=specialty-aps&srs=17276793011&page='
                 + str(QuotesSpider.number)
                 + '&qid=1570782228&ref=lp_17276793011_pg_2')
    if QuotesSpider.number <= 10:
        QuotesSpider.number += 1
        yield response.follow(next_page, callback=self.parse)
def parse_item(self, response):
    """Yield a single question/answer item scraped from a Q&A page."""
    print('------------------parse_item-----------------------')
    question_xpath = "normalize-space(//div[@class='cdQuestionText']/text())"
    answer_xpath = ("normalize-space(//div[@class='cdMessageInfo']"
                    "/span[contains(@id,'cdPostContentBox_')]/text())")
    item = AmazonItem()
    # One question per page; answers may be multiple, hence extract().
    item['question'] = response.xpath(question_xpath).extract_first()
    item['answer'] = response.xpath(answer_xpath).extract()
    yield item
def parse_item(self, response):
    # Scrape a book detail page into an AmazonItem. Two page layouts are
    # handled: when the plain productTitle span is absent the page is
    # treated as a Kindle/ebook layout (first branch), otherwise the
    # print-book layout (else branch). Category, url, version and the
    # final yield are common to both. NOTE(review): branch boundaries were
    # reconstructed from collapsed source — confirm against the original.
    item = AmazonItem()
    item['name'] = response.xpath(
        '//span[@id="productTitle"]/text()').extract_first()
    if item['name'] is None:
        # Ebook layout: title, author, review count, cover image, date,
        # price all live under ebook-specific ids.
        item['name'] = response.xpath(
            '//span[@id="ebooksProductTitle"]/text()').extract()
        item['name'] = "".join([i.strip() for i in item['name']])
        item['author'] = response.xpath(
            '//div[@id="bylineInfo"]//a/text()').extract()
        # Join non-empty author fragments with commas.
        item['author'] = (',').join(
            [i.strip() for i in item['author'] if i.strip()])
        item['comments'] = response.xpath(
            '//a[@id="cmrs-atf"]/text()').extract()
        item['img'] = response.xpath(
            '//div[@id="ebooksImageBlockContainer"]//img/@data-a-dynamic-image'
        ).extract()
        item['pub_date'] = response.xpath(
            '//div[@class="buying"]/span[2]/text()').extract_first()
        item['price'] = response.xpath(
            '//span[@class="a-color-price"]//text()').extract_first()
        if item['price'] is None:
            # Fallback price selector for pages without a-color-price.
            item['price'] = response.xpath(
                '//span[@class="a-size-base a-color-price a-color-price"]//text()'
            ).get()
        item['price'] = item['price'].strip()
        item['comments'] = response.xpath(
            '//span[@id="acrCustomerReviewText"]/text()').get()
    else:
        # Print-book layout.
        item['pub_date'] = response.xpath(
            '//h1[@id="title"]//span/text()').extract()
        # Collapse whitespace out of the joined title/date fragments.
        item['pub_date'] = re.sub(' ', '', "".join(item['pub_date']))
        item['author'] = response.xpath(
            '//div[@id="bylineInfo"]//a/text()').extract()
        item['author'] = (',').join(
            [i.strip() for i in item['author'] if i.strip()])
        item['price'] = response.xpath(
            '//span[@class="a-size-base a-color-price a-color-price"]//text()'
        ).get()
        if not item['price']:
            item['price'] = response.xpath(
                '//span[contains(@class,"a-size-base")]/text()').get()
        item['price'] = item['price'].strip()
        item['comments'] = response.xpath(
            '//span[@id="acrCustomerReviewText"]/text()').get()
        item['img'] = response.xpath(
            '//div[@id="imageBlockContainer"]//img/@data-a-dynamic-image'
        ).extract()
    # Breadcrumb categories joined as "a>b>c".
    item['cate'] = response.xpath(
        '//ul[@class="a-unordered-list a-horizontal a-size-small"]//span[@class="a-list-item"]//text()'
    ).extract()
    item['cate'] = ">".join([i.strip() for i in item['cate']])
    item['url'] = response.url
    item['version'] = response.xpath(
        '//li[@class="swatchElement selected"]//a/span/text()'
    ).extract_first()
    yield item
def parse(self, response):
    """Yield a single item holding the product title and its sale price."""
    product = AmazonItem()
    title_parts = response.xpath('//h1[@id="title"]/span/text()').extract()
    price_parts = response.xpath(
        '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
    ).extract()
    product['product_name'] = ''.join(title_parts).strip()
    product['product_sale_price'] = ''.join(price_parts).strip()
    yield product
def parse(self, response):
    """Scrape name, price, category and availability for one product.

    FIX: the populated item was never yielded, so this callback produced
    no output at all; the missing ``yield`` is added.
    """
    items = AmazonItem()
    title = response.xpath('//h1[@id="title"]/span/text()').extract()
    sale_price = response.xpath(
        '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
    ).extract()
    category = response.xpath(
        '//a[@class="a-link-normal a-color-tertiary"]/text()').extract()
    availability = response.xpath(
        '//div[@id="availability"]//text()').extract()
    items['product_name'] = ''.join(title).strip()
    items['product_sale_price'] = ''.join(sale_price).strip()
    items['product_category'] = ','.join(
        map(lambda x: x.strip(), category)).strip()
    items['product_availability'] = ''.join(availability).strip()
    yield items
def parse_items(self, response):
    """Collect title, feature bullets, page URL and landing-image URL."""
    product = AmazonItem()
    product['title'] = response.xpath(
        './/*[@id="productTitle"]/text()').extract()
    product['features'] = response.xpath(
        './/*[@id="feature-bullets"]/ul//li/span/text()').extract()
    product['product_url'] = response.url
    product['image_urls'] = response.xpath(
        './/*[@id="landingImage"]/@src').extract()
    yield product
def parse(self, response): print ">>>>>", response.request.url sel = Selector(response) items = [] item = AmazonItem() item['url'] = response.request.url #import pdb;pdb.set_trace() #item['href'] = sel.xpath('//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract() title = str( list( map(unicode.strip, sel.xpath('//span[@id="productTitle"]/text()').extract()))) title = title.replace("u'", "").replace("[", "").replace("]", "") item['title'] = title brand = str( list( map(unicode.strip, sel.xpath('//a[@id="brand"]/text()').extract()))) brand = brand.replace("u'", "").replace("[", "").replace("]", "") item['brand'] = brand import pdb pdb.set_trace() price = str( list( map( unicode.strip, sel.xpath( '//span[@class="olp-padding-right"]//span[@class="a-color-price"]//text()' ).extract()))) price = price.replace("Rs.", "").replace("[", "").replace( "]", "").replace("u'", "").replace(",", "").replace("'", "").replace("$", "") item['price'] = float(price) * 67.30 desc = str( list( map( unicode.strip, sel.xpath( '//div/ul[@class="a-vertical a-spacing-none"]/li//text()' ).extract()))) desc = desc.replace("[", "").replace("]", "").replace("u'", "") item['desc'] = desc category = str( list( map( unicode.strip, sel.xpath( '//div[@data-feature-name="wayfinding-breadcrumbs"]/ul/li//a//text()' ).extract()))) category = category.replace("u'", "").replace("[", "").replace( "]", "").replace('"', '').replace("'", "") item['category'] = category if item['title'] and item['category'] and item['price']: return item
def parse(self, response):
    """Yield the page's best-seller book lists, then re-request the listing.

    Note: range(1, 2) only produces page 1 — original behavior kept.
    """
    book = AmazonItem()
    book['books_name'] = response.xpath(
        '//div[@class="p13n-sc-truncate p13n-sc-line-clamp-1"]/text()').extract()
    book['author'] = response.xpath(
        '//span[@class="a-size-small a-color-base"]/text()').extract()
    book['price'] = response.xpath(
        '//span[@class="p13n-sc-price"]/text()').extract()
    book['books_link'] = response.xpath(
        '//a[@class="a-link-normal a-text-normal"]/@href').extract()
    yield book
    for page in range(1, 2):
        page_url = ('https://www.amazon.ca/Best-Sellers-Books/zgbs/books/ref=zg_bs_pg_'
                    + str(page) + '?_encoding=UTF8&pg=' + str(page))
        yield Request(url=page_url, callback=self.parse)
def parse(self, response):
    # Walk the top-reviewers table: for each reviewer, load their profile
    # in the Selenium-driven browser, snapshot cookies to disk, try to
    # fetch an e-mail via self.email_fetch, and yield name/email/id.
    # Python 2 code (print statements). NOTE(review): the bare except
    # swallows all errors from the email fetch — deliberate best-effort,
    # but it also hides real failures.
    for reviewer in response.xpath(
            '//tr[contains(@id, "reviewer1")]/td[3]/a'):
        name = reviewer.xpath('b/text()').extract()
        href = reviewer.xpath('@href').extract()
        rev_url = 'http://www.amazon.com' + href[0]
        # Load the profile page in the real browser (JS-rendered content).
        self.driver.get(rev_url)
        rev_id = rev_url.split('/')[-1]
        if rev_id == '':
            # Trailing slash case: id is the second-to-last path segment.
            rev_id = response.url.split('/')[-2]
        usr_xpath = '//a[@id="/gp/profile/' + rev_id + '"]'
        see_more_xpath = '//a[@class="a-declarative"]'
        email_xpath = '//span[contains(@class, "a-size-small a-color-link break-word pr-show-email")]'
        email = ''
        # Persist the browser's cookies for reuse by other requests.
        temp_cookies = self.driver.get_cookies()
        with open(self.cookie_file_name, 'w') as buffers:
            json.dump(temp_cookies, buffers)
        eny_id = '-'
        try:
            # Extract the canonical profile id from the URL, then attempt
            # an e-mail lookup through the helper.
            eny_id = re.compile(r'.*\/profile\/(\w+)').search(
                rev_url).group(1)
            email_attempt = self.email_fetch(eny_id)
            print 'email_attempt@@@@@@@@@@@: ' + email_attempt
        except:
            # Any failure above marks the e-mail as unavailable.
            email = '-'
        sel = scrapy.Selector(text=self.driver.page_source)
        if email != '-':
            # Fall back to scraping the e-mail text off the rendered page.
            email = sel.xpath(usr_xpath + '/text()').extract()[0]
        item = AmazonItem()
        item['name'] = name
        item['email'] = email
        item['idstr'] = eny_id
        yield item
    # Advance to the next top-reviewers page until self.end is reached.
    self.i += 1
    if self.i <= self.end:
        yield scrapy.Request(
            'http://www.amazon.com/review/top-reviewers?page=' + str(self.i),
            callback=self.parse)
def parse(self, response):
    """Yield one item with the deal title and its sale price."""
    deal = AmazonItem()
    title_parts = response.xpath(
        '//*[@id="dealTitle"]/span/text()').extract()
    price_parts = response.xpath(
        '//*[@id="100_dealView_0"]/div/div[2]/div/div/div[3]/div[1]/span/text()'
    ).extract()
    deal['product_deal'] = ''.join(title_parts).strip()
    deal['product_sale_price'] = ''.join(price_parts).strip()
    yield deal
def get_next_page_data(self, response):
    """Yield one item per product link on a best-seller ordered-list page.

    Query strings are stripped from each link before joining to self.host.
    """
    item = AmazonItem()
    category = response.meta['product_category']
    hrefs = response.xpath(
        '//ol[@id="zg-ordered-list"]/li/span/div/span/a/@href').extract()
    for href in hrefs:
        clean_url = self.host + href.split('?')[0]
        item['product_url'] = clean_url
        item['product_url_page_num'] = clean_url.split('/')[-1]
        item['product_category'] = category
        item['update_time'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        item['error'] = 'yes'
        yield item
def next(self, response): print response.url item = AmazonItem() item['url'] = response.url print item['url'] item['title'] = response.xpath('//span[id="productTitle"]').extract() print item['title'] item['price'] = response.xpath('//span[@id="priceblock_ourprice"]/text()').extract() print item['price'] yield item
def parse(self, response):
    """Pair result links with titles and extract each link's 10-char ASIN.

    FIX: ``re.findall(...)[0]`` raised IndexError for links that carry no
    '/dp/<ASIN>' segment (ads, redirects), aborting the whole page; such
    results now get ASIN_Product=None and the rest are still yielded.
    """
    Link = response.css('.a-text-normal').css('a::attr(href)').extract()
    Title = response.css('span.a-text-normal').css('::text').extract()
    # zip truncates to the shorter list, pairing links with titles.
    for link, title in zip(Link, Title):
        item = AmazonItem()
        item['title_Product'] = title
        item['link_Product'] = link
        match = re.search(r"(?<=dp/)[A-Z0-9]{10}", link)
        item['ASIN_Product'] = match.group(0) if match else None
        item['url_response'] = response.url
        yield item
def detail_parse(self, response):
    """Return one item with brand/price/title/shop fields from a detail page.

    All selectors are scoped to the #centerCol column; a missing field
    raises IndexError (original behavior kept).
    """
    center = response.xpath('//div[@id="centerCol"]')
    item = AmazonItem()
    item["brand"] = center.xpath('.//a[@id="bylineInfo"]/text()').extract()[0]
    item["price"] = center.xpath(
        './/span[@id="priceblock_ourprice"]/text()').extract()[0]
    item["desc"] = center.xpath(
        './/span[@id="productTitle"]/text()').extract()[0].strip()
    item["shop"] = center.xpath(
        './/span[@id="ddmMerchantMessage"]/a/text()').extract()[0]
    item["shop_url"] = urljoin(
        "https://www.amazon.cn",
        center.xpath('.//span[@id="ddmMerchantMessage"]/a/@href').extract()[0])
    return item
def parse(self, response):
    """Scrape review texts from the #revMHRL block and insert each into the
    Oracle ``reviews`` table.

    SECURITY FIX: the review text was concatenated straight into the SQL
    statement — an injection vector that also broke on any quote character
    in a review; it now uses a cx_Oracle bind variable. Unused locals
    (items / LinkExtractor links) were removed.
    """
    con = cx_Oracle.connect('Dhiren/[email protected]/xe')
    cursor = con.cursor()
    try:
        for link in response.xpath('//*[@id="revMHRL"]/div'):
            t = ''.join(link.xpath('.//div/div/a[2]/span/text()').extract())
            r = ''.join(
                link.xpath('.//div[@class="a-section"]/text()').extract())
            print(t)
            # Bind variable keeps the text as data, never as SQL.
            cursor.execute(
                "insert into reviews(review) values (:review)",
                review=r)
            con.commit()
    finally:
        cursor.close()
        con.close()
def parse(self, response):
    """Emit one item per top-level category <li> and follow each category URL.

    A deep copy of the item travels in meta so later callbacks can extend
    it without mutating this one.
    """
    category_rows = response.xpath(
        "//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-one']/div/li"
    )
    for row in category_rows:
        item = AmazonItem()
        item["main_title"] = row.xpath(".//a/span/text()").extract_first()
        item["main_title_url"] = row.xpath(".//a/@href").extract_first()
        item["main_title_url"] = urljoin(response.url, item["main_title_url"])
        yield scrapy.Request(
            item["main_title_url"],
            meta={"item": deepcopy(item)},
            callback=self.vice_title_list,
        )
def parse_vice_title(self, response):
    """Queue a detail request for every sub-category link on the page.

    FIX: the ``vice_title_url`` assignment was duplicated verbatim; the
    second copy repeated the XPath work for no effect and is removed.
    """
    item = AmazonItem()
    li_list = response.xpath(
        "//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-two']/div/li"
    )
    for li in li_list:
        item["vice_title"] = li.xpath(
            "./span/a/span/text()").extract_first()
        item["vice_title_url"] = "https://www.amazon.cn" + li.xpath(
            "./span/a/@href").extract_first()
        yield scrapy.Request(item["vice_title_url"],
                             meta={"item": deepcopy(item)},
                             callback=self.books_detail)
def parse(self, response):
    """Yield an item with ASIN, current price and strike-through list price.

    The Title/Description/Image selectors (for the .com layout) remain
    disabled, as in the original code.
    """
    field_paths = (
        ('ASIN',
         'normalize-space(//th[contains(.,"ASIN")]//following-sibling::td/text())'),
        ('Price',
         'normalize-space(//span[contains(@id,"priceblock_ourprice")]/text())'),
        ('List_price',
         'normalize-space(//span[contains(@class,"a-text-strike")]/text())'),
    )
    ml_item = AmazonItem()
    for field, path in field_paths:
        ml_item[field] = response.xpath(path).extract()
    yield ml_item
def parse_detail(self, response):
    """Yield name + price for a phone detail page; skip incomplete pages.

    FIX: ``extract_first().strip()`` raised AttributeError whenever the
    title node was missing (extract_first() returns None), so the
    None-check below could never fire; strip only after the check.
    """
    item = AmazonItem()
    name = response.xpath(
        '//span[@id="productTitle"]/text()').extract_first()
    price = response.xpath(
        '//span[@id="priceblock_ourprice"]/text()').extract_first()
    # Only emit items that have both fields, as before.
    if name is not None and price is not None:
        item["name"] = name.strip()
        item["price"] = price
        yield item
def parse(self, response):
    """Yield the product title together with its feature-bullet lines."""
    item = AmazonItem()
    item['productname'] = response.xpath(
        "normalize-space(//h1[@id='title']/span[@id='productTitle']/text())"
    ).extract()
    item['description'] = response.xpath(
        "//div[@id='featurebullets_feature_div']/div[@id='feature-bullets']/ul[@class='a-unordered-list a-vertical a-spacing-none']/li/span[@class='a-list-item']/text()"
    ).extract()
    yield item
def parse_mobile(self, response):
    # Scrape one amazon.cn phone detail page (brand, url, _id, model,
    # average score) and queue the first review page. Python 2 code
    # (print statements, has_key, xrange); text is encoded to GBK before
    # comparison — presumably because the site serves GBK — TODO confirm.
    # url and brand
    item = AmazonItem()
    url = response.url
    print "-----------------amazon--------------"
    print url
    brand = response.meta['brand']
    item['brand'] = brand
    item['url'] = url
    # _id: the query-string 'qid' plus the leading digits of the
    # customer-review count (or '0' when there are no reviews).
    aa = dict([(k, v[0]) for k, v in urlparse.parse_qs(
        urlparse.urlparse(url).query).items()])
    qid = []
    if aa.has_key('qid'):
        qid = aa['qid']
    count = response.xpath(
        '//*[@id="acrCustomerReviewText"]/text()').extract_first()
    if count:
        count = count.encode("GBK", "ignore")
        # Keep only the number before the first space.
        end = count.find(' ')
        id = qid + count[0:end]
    else:
        id = qid + '0'
    item['_id'] = id
    # model: scan the product-details table for the row whose first cell
    # matches the (GBK-encoded) label below; default to ''.
    attribute_list = response.xpath(
        '//*[@id="prodDetails"]//div[1]/div/div[2]/div/div/table/tbody/tr/td[1]/text()'
    ).extract()
    value_list = response.xpath(
        '//*[@id="prodDetails"]//div[1]/div/div[2]/div/div/table/tbody/tr/td[2]/text()'
    ).extract()
    len_list = len(attribute_list)
    temp = '型号'
    temp = temp.encode("GBK", "ignore")
    item['model'] = ''
    for i in xrange(len_list):
        attribute = attribute_list[i].encode("GBK", "ignore")
        if attribute == temp:
            model = value_list[i]
            item['model'] = model
    # phone_name: "<brand> <model>"
    phone_name = brand + ' ' + item['model']
    item['phone_name'] = phone_name
    # average review score
    average_score = response.xpath(
        '//*[@id="summaryStars"]/a/i/span/text()').extract_first()
    item['average_score'] = average_score
    # Follow the first review page, carrying the partial item along in meta.
    review_list = []
    review_url = response.xpath('//*[@id="revF"]/div/a/@href').extract_first()
    if review_url:
        url = review_url + "&pageNumber=1"
        yield scrapy.Request(
            url,
            meta={'review_list': review_list, 'item': item},
            callback=self.parse_review)