예제 #1
0
    def parse(self, response):
        """Parse a best-sellers listing page: yield one AmazonItem per row,
        then follow the next-page link (once per page).
        """
        div_zg_itemRow_lst = response.xpath('//div[@class="zg_itemRow"]')

        for div_zg_itemRow in div_zg_itemRow_lst:
            item = AmazonItem()
            item['rank_number'] = div_zg_itemRow.xpath(
                './/span[@class="zg_rankNumber"]/text()').extract_first()
            item['book_name'] = div_zg_itemRow.xpath(
                './/a[@class="a-link-normal"]/text()')[2].extract()
            item['author'] = div_zg_itemRow.xpath(
                './/span[@class="a-size-small a-color-base"]/text()'
            ).extract_first()
            # BUG FIX: the original used '///span[...]' — invalid XPath — so
            # star_rank could never be extracted; scope it to the current row.
            item['star_rank'] = div_zg_itemRow.xpath(
                './/span[@class="a-icon-alt"]/text()').extract_first()
            item['book_type'] = div_zg_itemRow.xpath(
                './/span[@class="a-size-small a-color-secondary"]/text()'
            ).extract_first()
            item['price'] = div_zg_itemRow.xpath(
                './/span[@class="p13n-sc-price"]/text()').extract_first()
            yield item

        # BUG FIX: pagination was inside the item loop, scheduling one
        # duplicate next-page request per row; follow the link once per page.
        xpath_next_page = './/li[@class="zg_page zg_selected"]/following-sibling::li/a/@href'
        if response.xpath(xpath_next_page):
            url_next_page = response.xpath(xpath_next_page).extract_first()
            yield scrapy.Request(url_next_page, callback=self.parse)
예제 #2
0
    def parse_detail(self, response):
        """Parse a search-results listing: yield one AmazonItem per result row.

        The category ("type") comes from the page header; every other field
        is scraped from the individual <li> result.
        """
        li_list = response.xpath("//ul[@id = 's-results-list-atf']/li")

        # PERF: the category selector is page-level and identical for every
        # row; the original re-evaluated it once per <li>, so hoist it.
        category = response.xpath(
            '//span[@class="a-color-state a-text-bold"]/text()'
        ).extract_first()

        for li in li_list:
            item = AmazonItem()
            # Listing category
            item["type"] = category
            # Product image
            item["img_url"] = li.xpath('.//img/@src').extract_first()
            # Product name
            item["product_name"] = li.xpath('.//h2/text()').extract_first()
            # Product URL
            item["product_url"] = li.xpath(
                './/div[@class = "a-row a-spacing-mini"]//a/@href'
            ).extract_first()
            # Product price
            item["product_price"] = li.xpath(
                './/a[@class="a-link-normal a-text-normal"]/span/text()'
            ).extract_first()
            # Product rating
            item["product_score"] = li.xpath(
                './/a[@class="a-popover-trigger a-declarative"]//span[@class="a-icon-alt"]/text()'
            ).extract_first()
            # Shipping cost
            item["product_freight"] = li.xpath(
                './/a[@class="a-link-normal a-text-normal"]/following-sibling::span[@class="a-size-small a-color-secondary"]/text()'
            ).extract_first()
            yield item
예제 #3
0
    def parse(self, response):
        """Collect product name/link/image triples from a result page,
        download each image into ./crawlImages/ and yield one item per
        product (Python 2: uses urllib.urlretrieve).
        """
        detail_link = '//a[@class="a-link-normal s-access-detail-page s-overflow-ellipsis a-text-normal"]'
        namelist = response.xpath(detail_link + '/@title').extract()
        htmllist = response.xpath(detail_link + '/@href').extract()
        imglist = response.xpath(
            '//img[@class="s-access-image cfMarker"]/@src').extract()

        pwd = os.getcwd() + '/'
        if not os.path.isdir(pwd + 'crawlImages/'):
            os.mkdir(pwd + 'crawlImages/')

        for idx, name in enumerate(namelist):
            item = AmazonItem()
            item['Name'] = name
            item['Source'] = htmllist[idx]

            # Images are saved under a class-level, ever-increasing counter
            # so file names never collide across pages.
            img_path = pwd + "crawlImages/" + str(amazonSpider.imgcount) + ".jpg"
            urllib.urlretrieve(imglist[idx], img_path)
            item['Path'] = img_path
            amazonSpider.imgcount = amazonSpider.imgcount + 1
            yield item
예제 #4
0
 		def parse(self, response):
 			items = AmazonItem()
 			title = response.xpath('//a[@class ="a-link-normal s-access-detail-page  s-color-twister-title-link a-text-normal"]/@title').extract()
 			sale_price = response.xpath('//span[@class ="a-size-base a-color-price s-price a-text-bold"]/@text').extract()
 			items['product_name'] = ''.join(title).strip()
 			items['product_sale_price'] = ''.join(sale_price).strip()
 			yield items
예제 #5
0
    def parse_item(self, response):
        """Scrape one product detail page into an AmazonItem and stop the
        spider once MAX_CANT_TO_SEARCH items have been collected.
        """
        ml_item = AmazonItem()
        # Field -> raw XPath; each is wrapped in normalize-space() below.
        field_xpaths = {
            'ASIN': '//th[contains(.,"ASIN")]//following-sibling::td/text()',
            'Title': '//span[contains(@id,"productTitle")]/text()',
            'Description': '//div[contains(@id,"productDescription")]/div/p[1]/text()',
            'Price': '//span[contains(@id,"priceblock_ourprice")]/text()',
            'List_price': '//span[contains(@class,"a-text-strike")]/text()',
            'Image_URL': '//div[contains(@id,"imgTagWrapperId")]/img/@data-old-hires',
        }
        for field, xp in field_xpaths.items():
            ml_item[field] = response.xpath(
                'normalize-space(' + xp + ')').extract()

        self.count_item_scrapp += 1

        # Hard stop once the configured number of items has been scraped.
        if self.count_item_scrapp > MAX_CANT_TO_SEARCH:
            raise CloseSpider('item_exceeded')
        yield ml_item
예제 #6
0
    def parse(self, response):
        """Extract (name, url) for every result on a listing page, yield one
        item per product, then follow pagination.
        """
        products = response.css('li[id^="result"]')

        for prod in products:
            item = AmazonItem()
            item['name'] = prod.css('h2::attr(data-attribute)').extract_first()
            detail_href = prod.css(
                'a.a-link-normal.s-access-detail-page.s-color-twister-title-link.a-text-normal::attr(href)'
            ).extract_first()
            item['url'] = response.urljoin(detail_href)
            print(item)
            yield item

        # Follow the "next page" arrow when present.
        next_page = response.css('span.pagnRA a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

        print(products)
예제 #7
0
 def parse_page_product(self, response):
     """Check one tracked product's page for a price change (Python 2).

     Follows a lightning-deal redirect when present; otherwise locates the
     price in the raw response body, mails the owner on a price drop, and
     rewrites the backing JSON file once the last tracked product has been
     processed.
     """
     print response.meta['asin']
     asin = response.meta['asin']
     self.scraped_product_count += 1
     item = AmazonItem()
     lightning_deal_url = response.xpath(
         "//a[@title='View Offer']/@href").extract()
     if len(lightning_deal_url) == 1:
         # Lightning deal: the real price is behind the "View Offer" link,
         # so re-queue that URL and undo this page's count increment.
         print lightning_deal_url[0]
         self.scraped_product_count -= 1
         lightning_deal_url = "https://www." + self.allowed_domains[
             0] + lightning_deal_url[0]
         yield Request(lightning_deal_url,
                       callback=self.parse_page_product,
                       meta={'asin': asin})
     else:
         # Find the price by splitting the raw HTML on known labels, most
         # specific first; partition()[1] is empty when the label is absent.
         new_response_body = response.body.partition("Deal Price:")
         if len(new_response_body[1]) == 0:
             new_response_body = response.body.partition("Sale:")
         #print new_response_body[1]
         if len(new_response_body[1]) == 0:
             new_response_body = response.body.partition("Price:")
         if len(new_response_body[1]) != 0:
             # Drop thousands separators, then take the first number that
             # sits between '>' and '<' (i.e. inside a tag's text).
             price_text = new_response_body[2].replace(",", "")
             priceRegex = re.compile(r"(?<=\>)\s*\d+\.\d+|\d+(?=\<)")
             price_match = priceRegex.search(price_text)
             if price_match:
                 price = price_match.group(0)
                 product_price = price.replace(" ", "")
                 item["name"] = self.productDataDict[asin]["name"]
                 item["price"] = product_price
                 old_price = self.productDataDict[asin]["price"]
                 if len(old_price) > 0:
                     # Compare with the previously stored price and notify
                     # the owner when it dropped.
                     new_price = round(float(product_price), 2)
                     old_price = round(float(old_price), 2)
                     if new_price < old_price:
                         self.send_mail(
                             old_price, new_price, asin,
                             self.productDataDict[asin]["user_mail_id"],
                             self.productDataDict[asin]["name"])
                         #print "Price Dropped!"
                 #print asin + '\t=======>\t' + str(product_price)
                 self.productDataDict[asin]["price"] = product_price
                 # After the last tracked product, rewrite the backing JSON
                 # file in place and close it.
                 if self.scraped_product_count == len(self.productDataDict):
                     #print "inside file close1"
                     self.fileObject.seek(0)
                     self.fileObject.truncate()
                     json.dump(self.productDataDict, self.fileObject)
                     self.fileObject.close()
                 #item["url"] =
                 yield item
             else:
                 # No price found; blank the stored URL and still flush the
                 # file if this was the last product.
                 self.productDataDict[asin]["url"] = ""
                 if self.scraped_product_count == len(self.productDataDict):
                     #print "inside file close2"
                     self.fileObject.seek(0)
                     self.fileObject.truncate()
                     json.dump(self.productDataDict, self.fileObject)
                     self.fileObject.close()
                     print 'Failed to get the price of the product'
예제 #8
0
파일: cameras.py 프로젝트: sundarpy/Bitshop
 def parse_products(self, response):
     """Populate one AmazonItem from a camera product page and return it as
     a single-element list (category fixed to Cameras, Audio & Video).
     """
     hxs = HtmlXPathSelector(response)
     item = AmazonItem()
     # Field -> XPath; all of these return the full extract() list.
     selectors = {
         'title': '//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()',
         'brand': '//a[@id="brand"]/text()',
         'specs': '//div[@class="pdTab"][1]//node()',
         'offerprice': '//span[@id="priceblock_ourprice"]/text()',
         'saleprice': '//span[@id="priceblock_saleprice"]/text()',
         'description': '//div[@id="productDescription"]//text()',
         'feature': '//ul[@class="a-vertical a-spacing-none"]/li/span/text()',
         'image': '//span[@class="a-button-text"]/img/@src',
         'seller': '//div[@id="merchant-info"]/a[1]/text()',
         'sellrating': '//div[@id="merchant-info"]/text()',
     }
     for field, xpath in selectors.items():
         item[field] = hxs.select(xpath).extract()
     # First star-rating text; an IndexError here mirrors the original
     # behavior when the rating block is missing.
     item['starating'] = hxs.select(
         '//a[@class="a-link-normal"]/i/span/text()').extract()[0]
     item['link'] = response.meta["url"]
     item['COD'] = "Available"
     item['category'] = "Cameras, Audio & Video"
     item['subcategory'] = "Home Audio & Video Accessories"
     return [item]
예제 #9
0
파일: books.py 프로젝트: sundarpy/Bitshop
 def parse_products(self, response):
     """Populate one AmazonItem from a book product page and return it as a
     single-element list (category fixed to Books > Tamil).
     """
     hxs = HtmlXPathSelector(response)
     item = AmazonItem()
     for field, xpath in (
             ('title', '//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()'),
             ('brand', '//a[@id="brand"]/text()'),
             ('specs', '//div[@class="pdTab"][1]//node()'),
             ('offerprice', '//span[@class="a-size-medium a-color-price inlineBlock-display offer-price a-text-normal price3P"]/text()'),
             ('saleprice', '//span[@id="priceblock_saleprice"]/text()'),
             ('description', '//div[@id="productDescription"]//text()'),
             ('feature', '//ul[@class="a-vertical a-spacing-none"]/li/span/text()'),
             ('image', '//div[@id="img-canvas"]/img/@src'),
             ('seller', '//div[@id="merchant-info"]/a[1]/text()'),
             ('sellrating', '//div[@id="merchant-info"]/text()'),
     ):
         item[field] = hxs.select(xpath).extract()
     # First star-rating text; IndexError here mirrors the original when
     # the rating block is absent.
     item['starating'] = hxs.select(
         '//a[@class="a-link-normal"]/i/span/text()').extract()[0]
     item['link'] = response.meta["url"]
     item['COD'] = "Available"
     item['category'] = "Books"
     item['subcategory'] = "Tamil"
     return [item]
예제 #10
0
    def parse(self, response):
        """Yield one item per complete (name, author, price) result row,
        then follow up to 10 numbered result pages.

        BUG FIXES: the original created a single AmazonItem outside the
        loop and mutated/yielded the same instance for every row; it also
        wrapped values in str() and compared against the string "None"
        instead of testing for None.
        """
        for data in response.css(
                '.s-include-content-margin .sg-row , .a-color-base.a-text-normal'
        ):
            name = data.css('span.a-color-base.a-text-normal::text').get()
            author = data.css(
                '.a-color-secondary .a-size-base.a-link-normal::text').get()
            price = data.css('.a-spacing-top-small .a-price-whole::text').get()
            # Only yield rows where all three fields were found.
            if name is not None and author is not None and price is not None:
                item = AmazonItem()
                item['name'] = name
                # Keep only the first line of the author text.
                item['author'] = author.strip().split('\n')[0]
                item['price'] = price
                yield item
                print("---" * 50)

        next_page = ('https://www.amazon.com/s?i=specialty-aps&srs=17276793011&page='
                     + str(QuotesSpider.number)
                     + '&qid=1570782228&ref=lp_17276793011_pg_2')
        if QuotesSpider.number <= 10:
            QuotesSpider.number += 1
            yield response.follow(next_page, callback=self.parse)
예제 #11
0
    def parse_item(self, response):
        """Extract one question/answer pair from an Amazon Q&A page."""

        print('------------------parse_item-----------------------')

        item = AmazonItem()
        item['question'] = response.xpath(
            "normalize-space(//div[@class='cdQuestionText']/text())"
        ).extract_first()
        item['answer'] = response.xpath(
            "normalize-space(//div[@class='cdMessageInfo']/span[contains(@id,'cdPostContentBox_')]/text())"
        ).extract()
        yield item
예제 #12
0
    def parse_item(self, response):
        """Parse an Amazon book detail page into an AmazonItem.

        Physical books expose their title under span#productTitle; Kindle
        ebooks use span#ebooksProductTitle, so the two page layouts are
        handled by separate branches. Author, comments, category, url and
        version are layout-independent (deduplicated from the original,
        which also contained a dead first assignment to item['comments']
        that was immediately overwritten).
        """
        item = AmazonItem()
        item['name'] = response.xpath(
            '//span[@id="productTitle"]/text()').extract_first()
        if item['name'] is None:
            # Ebook layout.
            item['name'] = "".join(i.strip() for i in response.xpath(
                '//span[@id="ebooksProductTitle"]/text()').extract())
            item['img'] = response.xpath(
                '//div[@id="ebooksImageBlockContainer"]//img/@data-a-dynamic-image'
            ).extract()
            item['pub_date'] = response.xpath(
                '//div[@class="buying"]/span[2]/text()').extract_first()
            item['price'] = response.xpath(
                '//span[@class="a-color-price"]//text()').extract_first()
            if item['price'] is None:
                item['price'] = response.xpath(
                    '//span[@class="a-size-base a-color-price a-color-price"]//text()'
                ).get()
        else:
            # Physical-book layout.
            item['pub_date'] = re.sub(' ', '', "".join(response.xpath(
                '//h1[@id="title"]//span/text()').extract()))
            item['price'] = response.xpath(
                '//span[@class="a-size-base a-color-price a-color-price"]//text()'
            ).get()
            if not item['price']:
                item['price'] = response.xpath(
                    '//span[contains(@class,"a-size-base")]/text()').get()
            item['img'] = response.xpath(
                '//div[@id="imageBlockContainer"]//img/@data-a-dynamic-image'
            ).extract()
        # Fields shared by both layouts.
        item['author'] = (',').join(
            i.strip() for i in response.xpath(
                '//div[@id="bylineInfo"]//a/text()').extract() if i.strip())
        # NOTE: raises AttributeError when no price selector matched —
        # identical to the original behavior.
        item['price'] = item['price'].strip()
        item['comments'] = response.xpath(
            '//span[@id="acrCustomerReviewText"]/text()').get()
        item['cate'] = ">".join(i.strip() for i in response.xpath(
            '//ul[@class="a-unordered-list a-horizontal a-size-small"]//span[@class="a-list-item"]//text()'
        ).extract())
        item['url'] = response.url
        item['version'] = response.xpath(
            '//li[@class="swatchElement selected"]//a/span/text()'
        ).extract_first()
        yield item
예제 #13
0
 def parse(self, response):
     """Yield one item holding the product title and its current price
     (regular or sale, whichever id is present)."""
     items = AmazonItem()
     title_parts = response.xpath('//h1[@id="title"]/span/text()').extract()
     price_parts = response.xpath(
         '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
     ).extract()
     items['product_name'] = ''.join(title_parts).strip()
     items['product_sale_price'] = ''.join(price_parts).strip()
     yield items
예제 #14
0
 def parse(self, response):
     """Extract title, price, category and availability into one item.

     BUG FIX: in the original the body was not indented under the def and
     the populated item was never yielded, so the callback produced no
     output; the item is now yielded like the sibling spiders do.
     """
     items = AmazonItem()
     title = response.xpath('//h1[@id="title"]/span/text()').extract()
     sale_price = response.xpath(
         '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()'
     ).extract()
     category = response.xpath(
         '//a[@class="a-link-normal a-color-tertiary"]/text()').extract()
     availability = response.xpath(
         '//div[@id="availability"]//text()').extract()
     items['product_name'] = ''.join(title).strip()
     items['product_sale_price'] = ''.join(sale_price).strip()
     items['product_category'] = ','.join(
         map(lambda x: x.strip(), category)).strip()
     items['product_availability'] = ''.join(availability).strip()
     yield items
예제 #15
0
 def parse_items(self, response):
     """Yield the product title, feature bullets, page URL and main image."""
     items = AmazonItem()
     xpath_by_field = (
         ('title', './/*[@id="productTitle"]/text()'),
         ('features', './/*[@id="feature-bullets"]/ul//li/span/text()'),
         ('image_urls', './/*[@id="landingImage"]/@src'),
     )
     for field, query in xpath_by_field:
         items[field] = response.xpath(query).extract()
     items['product_url'] = response.url
     yield items
예제 #16
0
 def parse(self, response):
     print ">>>>>", response.request.url
     sel = Selector(response)
     items = []
     item = AmazonItem()
     item['url'] = response.request.url
     #import pdb;pdb.set_trace()
     #item['href'] = sel.xpath('//div[@class="a-row a-spacing-none"]/a[@class="a-link-normal a-text-normal"]/@href').extract()
     title = str(
         list(
             map(unicode.strip,
                 sel.xpath('//span[@id="productTitle"]/text()').extract())))
     title = title.replace("u'", "").replace("[", "").replace("]", "")
     item['title'] = title
     brand = str(
         list(
             map(unicode.strip,
                 sel.xpath('//a[@id="brand"]/text()').extract())))
     brand = brand.replace("u'", "").replace("[", "").replace("]", "")
     item['brand'] = brand
     import pdb
     pdb.set_trace()
     price = str(
         list(
             map(
                 unicode.strip,
                 sel.xpath(
                     '//span[@class="olp-padding-right"]//span[@class="a-color-price"]//text()'
                 ).extract())))
     price = price.replace("Rs.", "").replace("[", "").replace(
         "]", "").replace("u'",
                          "").replace(",", "").replace("'",
                                                       "").replace("$", "")
     item['price'] = float(price) * 67.30
     desc = str(
         list(
             map(
                 unicode.strip,
                 sel.xpath(
                     '//div/ul[@class="a-vertical a-spacing-none"]/li//text()'
                 ).extract())))
     desc = desc.replace("[", "").replace("]", "").replace("u'", "")
     item['desc'] = desc
     category = str(
         list(
             map(
                 unicode.strip,
                 sel.xpath(
                     '//div[@data-feature-name="wayfinding-breadcrumbs"]/ul/li//a//text()'
                 ).extract())))
     category = category.replace("u'", "").replace("[", "").replace(
         "]", "").replace('"', '').replace("'", "")
     item['category'] = category
     if item['title'] and item['category'] and item['price']:
         return item
예제 #17
0
 def parse(self, response):
     """Scrape the best-seller book list into one item, then re-request
     the (single) numbered listing page."""
     book = AmazonItem()
     book['books_name'] = response.xpath(
         '//div[@class="p13n-sc-truncate p13n-sc-line-clamp-1"]/text()').extract()
     book['author'] = response.xpath(
         '//span[@class="a-size-small a-color-base"]/text()').extract()
     book['price'] = response.xpath(
         '//span[@class="p13n-sc-price"]/text()').extract()
     book['books_link'] = response.xpath(
         '//a[@class="a-link-normal a-text-normal"]/@href').extract()
     yield book
     # range(1, 2) deliberately covers page 1 only, as in the original.
     for page in range(1, 2):
         page_url = ('https://www.amazon.ca/Best-Sellers-Books/zgbs/books/ref=zg_bs_pg_'
                     + str(page) + '?_encoding=UTF8&pg=' + str(page))
         yield Request(url=page_url, callback=self.parse)
    def parse(self, response):
        """Walk the top-reviewers table, load each reviewer's profile in the
        Selenium driver, attempt to recover an e-mail address, and yield one
        item per reviewer; finally paginate to the next top-reviewers page.
        """
        #// *[ @ id = "reviewer1"] / td[3] / a
        for reviewer in response.xpath(
                '//tr[contains(@id, "reviewer1")]/td[3]/a'):
            name = reviewer.xpath('b/text()').extract()
            href = reviewer.xpath('@href').extract()

            rev_url = 'http://www.amazon.com' + href[0]

            # Load the profile in the real browser so JS-rendered content
            # (and session cookies) are available.
            self.driver.get(rev_url)
            rev_id = rev_url.split('/')[-1]
            if rev_id == '':
                rev_id = response.url.split('/')[-2]

            usr_xpath = '//a[@id="/gp/profile/' + rev_id + '"]'
            see_more_xpath = '//a[@class="a-declarative"]'
            # email_xpath = '//*[@id="a-page"]/div[2]/div/div[1]/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div/div[1]/div[2]/a'

            email_xpath = '//span[contains(@class, "a-size-small a-color-link break-word pr-show-email")]'
            email = ''

            # Persist the browser session's cookies for later reuse.
            temp_cookies = self.driver.get_cookies()
            with open(self.cookie_file_name, 'w') as buffers:
                json.dump(temp_cookies, buffers)

            #// *[ @ id = "a-page"] / div[2] / div / div[1] / div / div / div / div[2] / div / div[1] / div / span
            #//span[contains(@class, "public-name-text")]
            eny_id = '-'
            try:
                # Pull the profile id out of the URL and try the e-mail
                # lookup helper with it.
                eny_id = re.compile(r'.*\/profile\/(\w+)').search(
                    rev_url).group(1)
                email_attempt = self.email_fetch(eny_id)
                print 'email_attempt@@@@@@@@@@@: ' + email_attempt
            except:
                # NOTE(review): bare except swallows every error (including
                # regex misses and fetch failures) — consider narrowing.
                email = '-'

            # /gp/profile/A1WPFIZ8P3O86V
            sel = scrapy.Selector(text=self.driver.page_source)

            if email != '-':
                email = sel.xpath(usr_xpath + '/text()').extract()[0]

            item = AmazonItem()
            item['name'] = name
            item['email'] = email
            item['idstr'] = eny_id
            yield item

        # Paginate through the ranked reviewer pages up to self.end.
        self.i += 1
        if self.i <= self.end:
            yield scrapy.Request(
                'http://www.amazon.com/review/top-reviewers?page=' +
                str(self.i),
                callback=self.parse)
예제 #19
0
 def parse(self, response):
     """Yield one item with the lightning-deal title and its sale price."""
     items = AmazonItem()
     deal_title = response.xpath('//*[@id="dealTitle"]/span/text()').extract()
     deal_price = response.xpath(
         '//*[@id="100_dealView_0"]/div/div[2]/div/div/div[3]/div[1]/span/text()'
     ).extract()
     items['product_deal'] = ''.join(deal_title).strip()
     items['product_sale_price'] = ''.join(deal_price).strip()
     yield items
 def get_next_page_data(self, response):
     """Yield one item per product link on a best-seller list page.

     BUG FIX: the original reused a single AmazonItem instance across the
     loop, mutating and yielding the same object for every link; a fresh
     item is now created per link.
     """
     product_category = response.meta['product_category']
     product_link_list = response.xpath(
         '//ol[@id="zg-ordered-list"]/li/span/div/span/a/@href').extract()
     for link in product_link_list:
         item = AmazonItem()
         # Strip the query string before joining with the host.
         item['product_url'] = self.host + link.split('?')[0]
         item['product_url_page_num'] = item['product_url'].split('/')[-1]
         item['product_category'] = product_category
         item['update_time'] = datetime.datetime.now().strftime(
             "%Y-%m-%d %H:%M:%S")
         item['error'] = 'yes'
         yield item
예제 #21
0
    def next(self, response):
        print response.url
        item = AmazonItem()
        
        item['url'] = response.url
        print item['url']
        item['title'] = response.xpath('//span[id="productTitle"]').extract()
        print item['title']
        item['price'] = response.xpath('//span[@id="priceblock_ourprice"]/text()').extract()
        
        print item['price']

        yield item
예제 #22
0
    def parse(self, response):
        """Pair up result links and titles and yield one item per product,
        extracting the ASIN from the /dp/ segment of the link.

        ROBUSTNESS FIX: the original used re.findall(...)[0], which raised
        IndexError for any link without a /dp/<ASIN> segment and aborted
        the whole page; a missing ASIN now yields None instead.
        """
        Link = response.css('.a-text-normal').css('a::attr(href)').extract()
        Title = response.css('span.a-text-normal').css('::text').extract()

        for link, title in zip(Link, Title):
            item = AmazonItem()
            item['title_Product'] = title
            item['link_Product'] = link
            asin_match = re.search(r"(?<=dp/)[A-Z0-9]{10}", link)
            item['ASIN_Product'] = asin_match.group(0) if asin_match else None
            item['url_response'] = response.url
            yield item
예제 #23
0
 def detail_parse(self, response):
     """Scrape brand/price/title/shop info from the center column of a
     product page; an IndexError on any missing field matches the
     original's behavior."""
     item = AmazonItem()
     center = response.xpath('//div[@id="centerCol"]')
     item["brand"] = center.xpath(
         './/a[@id="bylineInfo"]/text()').extract()[0]
     item["price"] = center.xpath(
         './/span[@id="priceblock_ourprice"]/text()').extract()[0]
     item["desc"] = center.xpath(
         './/span[@id="productTitle"]/text()').extract()[0].strip()
     item["shop"] = center.xpath(
         './/span[@id="ddmMerchantMessage"]/a/text()').extract()[0]
     item["shop_url"] = urljoin(
         "https://www.amazon.cn",
         center.xpath('.//span[@id="ddmMerchantMessage"]/a/@href').extract()[0])
     return item
    def parse(self, response):
        """Scrape review texts from the most-helpful-reviews block and
        insert each into the Oracle `reviews` table.

        SECURITY FIX: the original built the INSERT statement by string
        concatenation with scraped (untrusted) text — SQL injection, and a
        hard failure on any review containing a quote; a bind variable is
        used instead. The connection is now closed in a finally block, and
        the unused item/LinkExtractor locals were removed.
        """
        con = cx_Oracle.connect('Dhiren/[email protected]/xe')
        try:
            cursor = con.cursor()
            for review in response.xpath('//*[@id="revMHRL"]/div'):
                t = ''.join(
                    review.xpath('.//div/div/a[2]/span/text()').extract())
                r = ''.join(
                    review.xpath('.//div[@class="a-section"]/text()').extract())
                print(t)
                cursor.execute(
                    "insert into reviews(review) values (:1)", [r])
                con.commit()
            cursor.close()
        finally:
            con.close()


#	else:
#		items['product_review_navy'] = link.xpath('.//div[@class="a-section"]/text()').extract()
#items['product_review'] = ''.join(review).strip()
#items['y'] = link.xpath('.//div[@class="a-section"]/text()').extract()
#   yield items
#y = link.xpath('@href').extract()
#print y

#  items['product_name'] = ''.join(title).strip()
#  items['product_sale_price'] = ''.join(sale_price).strip()
#  items['product_category'] = ','.join(map(lambda x: x.strip(), category)).strip()
#  items['product_availability'] = ''.join(availability).strip()

#//*[@id="revData-dpReviewsMostHelpfulAUI-R2BCSE4PLKSH7M"]/div
#//*[@id="revData-dpReviewsMostHelpfulAUI-R1GC3CZRYZE7LY"]/div

#///*[@id="revMH"]
예제 #25
0
파일: book1.py 프로젝트: daringBuaa/Spider
 def parse(self, response):
     """Walk the top-level category list and request each category page.

     Each <li> yields one item holding the category name and its absolute
     URL; the item travels to the next callback via request meta as a deep
     copy, so parallel requests don't share state.
     """
     category_sel = ("//ul[@class='a-unordered-list a-nostyle a-vertical "
                     "s-ref-indent-one']/div/li")
     for li in response.xpath(category_sel):
         item = AmazonItem()
         item["main_title"] = li.xpath(".//a/span/text()").extract_first()
         item["main_title_url"] = urljoin(
             response.url, li.xpath(".//a/@href").extract_first())
         yield scrapy.Request(item["main_title_url"],
                              meta={"item": deepcopy(item)},
                              callback=self.vice_title_list)
예제 #26
0
파일: book.py 프로젝트: daringBuaa/Spider
 def parse_vice_title(self, response):
     """Walk the second-level category list and request each sub-category
     page, passing the populated item along in request meta (deep-copied,
     since one item instance is reused across the loop).

     BUG FIX: the vice_title_url assignment was duplicated verbatim in the
     original; it is now computed once per <li>.
     """
     item = AmazonItem()
     li_list = response.xpath(
         "//ul[@class='a-unordered-list a-nostyle a-vertical s-ref-indent-two']/div/li"
     )
     for li in li_list:
         item["vice_title"] = li.xpath(
             "./span/a/span/text()").extract_first()
         item["vice_title_url"] = "https://www.amazon.cn" + li.xpath(
             "./span/a/@href").extract_first()
         yield scrapy.Request(item["vice_title_url"],
                              meta={"item": deepcopy(item)},
                              callback=self.books_detail)
예제 #27
0
 def parse(self, response):
     """Yield ASIN, price and list price extracted from a .com product
     page (each wrapped in normalize-space())."""
     ml_item = AmazonItem()
     queries = (
         ('ASIN', '//th[contains(.,"ASIN")]//following-sibling::td/text()'),
         ('Price', '//span[contains(@id,"priceblock_ourprice")]/text()'),
         ('List_price', '//span[contains(@class,"a-text-strike")]/text()'),
     )
     for field, query in queries:
         ml_item[field] = response.xpath(
             'normalize-space(' + query + ')').extract()
     yield ml_item
예제 #28
0
    def parse_detail(self, response):
        """Yield a (name, price) item for a phone detail page, skipping
        pages where either field is missing.

        BUG FIX: the original called .strip() on the result of
        extract_first() before checking for None, so a missing title raised
        AttributeError and the `name is not None` guard was unreachable;
        extract first, guard, then strip.
        """
        # Phone name (may be None when the selector does not match).
        name = response.xpath(
            '//span[@id="productTitle"]/text()').extract_first()

        # Price.
        price = response.xpath(
            '//span[@id="priceblock_ourprice"]/text()').extract_first()

        # Only yield when both fields were found.
        if name is not None and price is not None:
            item = AmazonItem()
            item["name"] = name.strip()
            item["price"] = price
            yield item
예제 #29
0
    def parse(self, response):
        """Yield the product title and its feature-bullet descriptions."""
        item = AmazonItem()
        item['productname'] = response.xpath(
            "normalize-space(//h1[@id='title']/span[@id='productTitle']/text())"
        ).extract()
        item['description'] = response.xpath(
            "//div[@id='featurebullets_feature_div']/div[@id='feature-bullets']/ul[@class='a-unordered-list a-vertical a-spacing-none']/li/span[@class='a-list-item']/text()"
        ).extract()
        yield item
예제 #30
0
 def parse_mobile(self,response):
     #url和brand
     item = AmazonItem()
     url=response.url
     print "-----------------amazon--------------"
     print url
     brand=response.meta['brand']
     item['brand']=brand
     item['url']=url
     #_id
     aa = dict([(k, v[0]) for k, v in urlparse.parse_qs(urlparse.urlparse(url).query).items()])
     qid = []
     if aa.has_key('qid'):
         qid = aa['qid']
     count=response.xpath('//*[@id="acrCustomerReviewText"]/text()').extract_first()
     if count:
         count = count.encode("GBK", "ignore")
         end = count.find(' ')
         id = qid + count[0:end]
     else:
         id=qid+'0'
     item['_id']=id
     #型号
     attribute_list=response.xpath('//*[@id="prodDetails"]//div[1]/div/div[2]/div/div/table/tbody/tr/td[1]/text()').extract()
     value_list=response.xpath('//*[@id="prodDetails"]//div[1]/div/div[2]/div/div/table/tbody/tr/td[2]/text()').extract()
     len_list=len(attribute_list)
     temp='型号'
     temp=temp.encode("GBK", "ignore")
     item['model']=''
     for i in xrange(len_list):
         attribute = attribute_list[i].encode("GBK", "ignore")
         if attribute == temp:
             model = value_list[i]
             item['model'] = model
     #phone_name
     phone_name=brand+' '+item['model']
     item['phone_name']=phone_name
     #平均分
     average_score = response.xpath('//*[@id="summaryStars"]/a/i/span/text()').extract_first()
     item['average_score'] = average_score
     #进入评论页
     review_list = []
     review_url=response.xpath('//*[@id="revF"]/div/a/@href').extract_first()
     if review_url:
         url = review_url + "&pageNumber=1"
         yield scrapy.Request(url,meta={'review_list':review_list,'item':item},callback=self.parse_review)