def parse_letv_com(self, response):
    """Scrape play count and comment count for a letv.com video page.

    Expects response.meta['item'] to carry a partially-filled item whose
    'doc' dict receives pv/up/down/comments.
    """
    hxs = HtmlXPathSelector(response)
    pid = hxs.re('pid:(\d+)')[0]
    vid = hxs.re('vid:(\d+)')[0]
    mid = hxs.re('mmsid:(\d+)')[0]
    # Play-count (pv) API.
    url_t = "http://stat.letv.com/vplay/queryMmsTotalPCount?callback=&cid=1&vid=%s&mid=%s&pid=%s"
    url = url_t % (vid, mid, pid)
    text = urllib.urlopen(url).read()
    pv = re.findall('media_play_count.*?(\d+)', text)[0]
    # letv exposes no up/down vote counts on this endpoint.
    up = 0
    down = 0
    # Comment-count API.
    # BUG FIX: the original URL contained the mojibake "video¬ice=1" --
    # the "&not" of "&notice" had been HTML-entity-decoded into U+00AC,
    # corrupting the query string.
    url_tt = "http://api.my.letv.com/vcm/api/g?jsonp=&type=video&notice=1&pid=%s&xid=%s&mmsid=%s&rows=10&page=1"
    url2 = url_tt % (pid, vid, mid)
    text2 = urllib.urlopen(url2).read()
    comments = re.findall('total.*?(\d+)', text2)[0]
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse_list_page(self, response):
    """Extract shop name/site-url pairs from a supplier listing page and
    yield an item (or follow-up request) per shop cell."""
    multi_xpath = '//div[@class="supply-cell" or @class="supply-cell supply-cell-bg"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    for cell in page_hxs.select(multi_xpath):
        # Shop display name may span several text nodes.
        shop_name = clean_string(''.join(cell.select('./div/div/span/a//text()').extract()))
        shop_site_url = ''.join(cell.select('./div/div/span/a[1]/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        detail_url = shop_site_url
        doc = {
            'shop_name': shop_name,
            'shop_site_url': shop_site_url,
        }
        if shop_site_url:
            next_request = Request(detail_url,
                                   headers={'referer': shop_site_url},
                                   callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request,
                       list_url=response.url,
                       query=response.meta['query'])
        yield self.item_or_request(item)
def item_parse(self, response): hxs = HtmlXPathSelector(text=response.body) page_ele = hxs.select('//div[@class="memo"]/text()') if len(page_ele) > 0: text = page_ele[0].extract() result = re.search(ur"北京时间(.+) 在(.+)\((.+),(.+)\) 发生(.+)级地震,震源深度(.+)公里", text, re.UNICODE) #print result.group(1), result.group(2), result.group(3), result.group(4), result.group(5), result.group(6) epoch = time.mktime(time.strptime(result.group(1), "%Y-%m-%d %H:%M")) if (epoch < self.last_timestamp): return name = result.group(2) magnitude = result.group(5) depth = result.group(6) latitude_re = re.search(r"([^\d]+)(\d+\.\d+)", result.group(3)) latitude = float(latitude_re.group(2)) if latitude_re.group(1) == u"北纬" else 0 - float(latitude_re.group(2)) longtitude_re = re.search(r"([^\d]+)(\d+\.\d+)", result.group(4)) longtitude = float(longtitude_re.group(2)) if longtitude_re.group(1) == u"东经" else 0 - float(longtitude_re.group(2)) #print latitude, longtitude db_file = self.db_file conn = sqlite3.connect(db_file) c = conn.cursor() print "insert %s to %f, %f\n" % (name, latitude, longtitude) c.execute("INSERT OR REPLACE INTO quake (name, longtitude, latitude, timestamp, depth, magnitude, source_url) VALUES (?,?,?,?,?,?,?)", [name, longtitude, latitude, epoch, depth, magnitude, response.url] ) conn.commit()
def parse_sohu_com(self, response):
    """Collect pv / up / down / comment counters for a sohu.com video."""
    hxs = HtmlXPathSelector(response)
    vid = ''.join(hxs.re('var vid="(\d+)')).strip()
    pid = ''.join(hxs.re('var playlistId="(\d+)')).strip()
    cid = ''.join(hxs.re('var cid="(\d+)')).strip()
    # Play count.
    pv_url = "http://count.vrs.sohu.com/count/stat.do?videoId=%s&playlistId=%s&categoryId=%s" % (vid, pid, cid)
    pv = ''.join(re.findall('(\d+)', urllib.urlopen(pv_url).read()))
    # Up/down votes come back as JSON wrapped in a JS callback; slice out
    # the braces before decoding.
    vote_url = "http://score.my.tv.sohu.com/digg/get.do?vid=%s&type=%s" % (vid, cid)
    raw = urllib.urlopen(vote_url).read()
    dj = json.loads(raw[raw.find('{'): raw.rfind('}') + 1])
    up = dj['upCount']
    down = dj['downCount']
    # Comment count.
    comments_url = "http://access.tv.sohu.com/reply/list.do?objid=%s&subobjid=%s&objtype=%s" % (pid, vid, cid)
    comments = re.findall('"allCount":(\d+)', urllib.urlopen(comments_url).read())[0]
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def detail(self, response):
    """Build one BillionPricesIndiaItem per (price, seller) pair found on a
    mobile-phone product page."""
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    variants_price = hxs.select("//div[@class='fleft catbox pricerate']//span/text()").extract()
    variants_seller = hxs.select("//div[@class='catbox fleft storeimage']/img/@alt").extract()
    items = []
    # BUG FIX: the original condition used "or ... != None", which is always
    # true for the list returned by extract(); require both lists non-empty.
    # (Dead locals quantitylist/pricelist removed.)
    if variants_price and variants_seller:
        for price, seller in zip(variants_price, variants_seller):
            item = BillionPricesIndiaItem()
            item['date'] = time.strftime("%d/%m/%Y")
            # Seller alt text ends with the vendor name.
            item['vendor'] = seller.split(" ")[-1:][0]
            item['product'] = response.url.split('/')[-1].split(".")[0]
            # Strip thousands separators, keep the trailing numeric token.
            itemprice = re.sub('[,]', '', price).split(" ")[-1:][0]
            item['category'] = "mobiles"
            item['price'] = float(itemprice)
            item['quantity'] = '1'
            item['measure'] = 'pcs'
            item['unitprice'] = float(itemprice)
            items.append(item)
    return items
def parse(self, response):
    """Translate IKEA product names (Swedish -> English) via the Microsoft
    Translator API and return one IkeaItem per product link."""
    # Obtain an OAuth access token for the translation API.
    atrequest = urllib2.Request('https://datamarket.accesscontrol.windows.net/v2/OAuth2-13')
    atrequest.add_data(atdata)
    access_token = json.loads(urllib2.urlopen(atrequest).read())['access_token']
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//span[contains(@class, "productsAzLink")]/a/text()').extract()
    items = []
    for site in sites:
        item = IkeaItem()
        # First word is the product name, the rest describes the thing.
        item['name'], _, item['thing'] = unicode(site).partition(' ')
        query = {'text': unicode(item['name']), 'from': 'sv', 'to': 'en'}
        request = urllib2.Request('http://api.microsofttranslator.com/v2/Http.svc/Translate?' + urllib.urlencode(query))
        request.add_header('Authorization', 'Bearer ' + access_token)
        # Renamed from `response` -- the original shadowed the callback arg.
        api_response = urllib2.urlopen(request)
        doc = etree.fromstring(api_response.read())
        text = []
        for elem in doc.xpath('/foo:string', namespaces={'foo': 'http://schemas.microsoft.com/2003/10/Serialization/'}):
            if elem.text:
                elem_text = ' '.join(elem.text.split())
                if len(elem_text) > 0:
                    text.append(elem_text)
        item['translation'] = ' '.join(text)
        items.append(item)
    return items
def parse(self, response):
    """Scan the 'Camara dos Deputados/BR' box, upsert each deputy into the
    local DB, and schedule assiduity/costs detail requests for each one."""
    hxs = HtmlXPathSelector(response)
    # Each anchor encodes url, name, party and state of one deputy.
    deputy_re = re.compile('<a class="listapar" href="(?P<url>.*?)">(?P<name>[\w\s]*[\w]+)\s*\(<b>[\w\s]+</b>\)\s-\s(?P<party>.*?)\/(?P<state>.*?)</a><br>', flags=re.U)
    for div in hxs.select('//div[@id="contem_boxes"]'):
        titulo = div.select('.//div[@id="contem_titulo"]/text()').extract()[0]
        if not titulo.endswith(u'mara dos Deputados/BR'):
            continue
        for match in deputy_re.finditer(div.extract()):
            dict_deputy = match.groupdict()
            db_deputy = self.api.get_deputado_por_nome(dict_deputy['name'])
            if db_deputy:
                dep = db_deputy[0]
            else:
                dep = Deputado(dict_deputy['name'], dict_deputy['state'], dict_deputy['party'])
                self.api.inserir_deputado(dep)
            id = urlparse.parse_qs(urlparse.urlparse(dict_deputy['url']).query).get('id', [0])[0]
            if not id:
                continue
            request = Request(urljoin(self.base_url, '@presencas.php?id=%s' % id), callback=self.parse_deputy_assiduity)
            request.meta['dep'] = dep
            yield request
            request = Request(urljoin(self.base_url, '@uso_verbas_als.php?uf=16&id=%s' % id), callback=self.parse_deputy_costs)
            request.meta['dep'] = dep
            yield request
def browse_and_parse(self, response):
    """Recursively browse category pages, follow pagination, and yield a
    product item per listing row (plus the single-product edge case)."""
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    # Follow each side-bar category link exactly once.
    for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url in self.navig_url_set:
            continue
        self.navig_url_set.add(subsubcat_url)
        yield Request(subsubcat_url, callback=self.browse_and_parse)
    # Pagination ("Neste" = next page).
    next_page = hxs.select("//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href")
    if next_page:
        yield Request(next_page[0].extract(), callback=self.browse_and_parse)
    # Product rows in this listing page, if any.
    for tr in hxs.select('//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', tr.select(".//td[2]//a/@href").extract()[0])
        loader.add_value('name', tr.select(".//td[2]//a/text()").extract()[0])
        loader.add_value('price', tr.select(".//td[3]/text()").extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield loader.load_item()
    # Edge case: a listing page that holds a single product.
    product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
    if product_price:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@id="productName"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('price', product_price[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield loader.load_item()
def parse(self, response): connection = pymongo.MongoClient("localhost", 30000) db = connection.academic hps = db.homepages tmp = hps.find_one({"url": response.url}) if not tmp: hxs = HtmlXPathSelector(response) urls = hxs.select('//a') contents = hxs.select('//p | //a | //b | //tr | //td | //li | //ul | //font | //span | //strong | //h1 | //h2 | //h3') link = [] text = "" for url in urls: u = ''.join(url.select('@href').extract()) if u[-4:] == ".pdf": link.append(u) for content in contents: s = ''.join(content.select('text()').extract()) if len(s) > 3: text += s hp = { "url" : response.url, "link" : link, "text" : text } print "[insert]" hps.insert(hp) else: print "[redundent]"
def parse_item(self, response):
    """Extract a RecipeItem from a recipe detail page.

    Further transforms are applied downstream by openrecipes.pipelines.
    """
    hxs = HtmlXPathSelector(response)
    # Scope element that contains the recipe info; normally a single match.
    base_path = """//div[@class="recipe-details"]"""
    recipes_scopes = hxs.select(base_path)
    name_path = '//h1[@itemprop="name"]/text()'
    recipeYield_path = '//label[@for="set_servings"]/input/@value'
    description_path = '//span[@itemprop="summary"]/p/text()'
    image_path = '//img[@class="the_recipe_image"]/@src'
    cookTime_path = '//form/p/time[@itemprop="cookTime"]/@datetime'
    prepTime_path = '//form/p/time[@itemprop="prepTime"]/@datetime'
    ingredients_path = '//span[@itemprop="ingredient"]'
    ingredients_amounts_path = './span[@itemprop="amount"]/text()'
    # BUG FIX: this previously pointed at the "amount" span again, so every
    # ingredient came out as "amount amount" and names were lost.
    ingredients_names_path = './span[@itemprop="name"]/text()'
    datePublished_path = '//span[@itemprop="published"]/@datetime'
    recipes = []
    for r_scope in recipes_scopes:
        item = RecipeItem()
        item['source'] = self.source
        item['name'] = r_scope.select(name_path).extract()
        item['image'] = r_scope.select(image_path).extract()
        item['description'] = r_scope.select(description_path).extract()
        item['url'] = response.url
        item['prepTime'] = r_scope.select(prepTime_path).extract()
        item['cookTime'] = r_scope.select(cookTime_path).extract()
        item['recipeYield'] = r_scope.select(recipeYield_path).extract()
        item['datePublished'] = r_scope.select(datePublished_path).extract()
        # Grab the amount and name spans separately, then pair them into
        # "amount name" strings.
        ingredient_scopes = r_scope.select(ingredients_path)
        amount = ingredient_scopes.select(ingredients_amounts_path).extract()
        name = ingredient_scopes.select(ingredients_names_path).extract()
        item['ingredients'] = [" ".join(ing).encode('utf-8') for ing in zip(amount, name)]
        recipes.append(item)
    return recipes
def parse_list(self, response):
    """Follow every paper link on a listing page, then recurse into the
    next listing page when one exists."""
    hxs = HtmlXPathSelector(response)
    for href in hxs.select(r'//ul[@id="paper-listing"]//a/@href').extract():
        yield Request(urlparse.urljoin(response.url, href), callback=self.parse_paper)
    next = hxs.select(r'//div[@class="pagination"]/ul/li[@class="next"]/a/@href')
    if next:
        yield Request(urlparse.urljoin(response.url, next[0].extract()), callback=self.parse_list)
def parse(self, response):
    """Collect every <form> on the page, with one InputItem per <input>
    field, into FormItem objects."""
    print(response)
    hxs = HtmlXPathSelector(response)
    forms = hxs.select('//form')
    print(forms)
    items = []
    for form in forms:
        formItem = FormItem()
        formItem["actionURL"] = form.select("@action").extract()
        items.append(formItem)
        inputItems = []
        for formInput in form.select(".//input"):
            inputItem = InputItem()
            inputItem["name"] = formInput.select("@name").extract()
            inputItem["size"] = formInput.select("@size").extract()
            inputItem["maxlength"] = formInput.select("@maxlength").extract()
            inputItem["value"] = formInput.select("@value").extract()
            # BUG FIX: HTML inputs carry a "type" attribute; the original
            # selected "@inputType", which matched nothing.
            inputItem["inputType"] = formInput.select("@type").extract()
            inputItems.append(inputItem)
        formItem["inputs"] = inputItems
    return items
def parse_start_url(self, response):
    """Build one BodytestItem per post in a forum topic page.

    The first poster on the page is taken to be the topic author.
    """
    x = HtmlXPathSelector(response)
    posters = x.select("//b[@class='postauthor']/text()").extract()
    # FIX: guard against a page with no posts (posters[0] raised IndexError).
    if not posters:
        return []
    op = posters[0]
    url = response.url
    title = x.select("//div[@id='pageheader']/h2/a/text()").extract()
    # NOTE(review): this selects every post body on the page; the pairing
    # with posters[i] below assumes the two lists line up one-to-one.
    post_body = x.select("//div[@class='postbody']").extract()
    # (Removed `posters_export`: it claimed to deduplicate posters but only
    # copied the list with op prepended and was never used.)
    topics = []
    for i, pb in enumerate(post_body):
        topic = BodytestItem()
        topic['topic_url'] = url
        topic['topic_title'] = title
        topic['thread_author'] = op
        topic['post_author'] = posters[i]
        topic['post_body'] = pb
        topics.append(topic)
    return topics
def result_page(self, response):
    """Yield one NssfcrawlerItem per non-empty store cell in the results table.

    Each cell's text nodes are: [?, store_name, address lines..., '',
    contact/phone section...].
    """
    hxs = HtmlXPathSelector(response)
    store_details = hxs.select('//*[@id="cb_block_inner"]/table/tr//td')
    state = response.meta['state']
    for store_detail in store_details:
        if not ''.join(store_detail.select('.//text()').extract()).strip():
            continue
        detials = [det.strip() for det in store_detail.select('.//text()').extract()]
        store_name = detials[1]
        # Address lines run until the first empty string.
        address = []
        # FIX: index_break was undefined when no blank separator was found.
        index_break = len(detials)
        for det in detials[2:]:
            if det:
                address.append(det)
            else:
                index_break = detials[2:].index('') + 2
                break
        location = None
        if address:
            if len(address) == 1:
                address = address[0]
            else:
                # Last address line is the town/location.
                location = address[-1]
                address = address[:-1]
        # FIX: contact_name / phone_no previously leaked across iterations
        # (or raised NameError) when a store had no "Contact" section.
        contact_name = None
        phone_no = None
        res = '!'.join(detials[index_break:])
        if 'Contact' in res:
            contact_name = res[res.find('t!: ') + 4:res.find('!!')]
            ph = re.compile('\(\d+\).\d+\-\d+').findall(res)
            if ph:
                phone_no = ph[0]
        item = NssfcrawlerItem(state=state, store_name=store_name, address=address,
                               location=location, contact_name=contact_name,
                               phone_no=phone_no)
        yield item
def parse(self, response):
    """
    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    hxs = HtmlXPathSelector(response)
    # One item per directory entry, e.g. //*[@id="bd-cross"]/fieldset[3]/ul/li[1]
    items = []
    for site in hxs.select('//ul/li'):
        item = Website()
        item['name'] = site.select('a/text()').extract()
        item['url'] = site.select('a/@href').extract()
        # Whitespace-trimmed description fragments (renamed loop var: the
        # original shadowed the builtin `str`).
        item['description'] = [frag.strip() for frag in site.select('text()').extract()]
        items.append(item)
    return items
def parse_item(response):
    """Scrape one Compuindia product page into a CompuindiaItem.

    Most selector results arrive as single-element lists; index 0 is taken
    and encoded to a UTF-8 byte string where the field expects text.
    """
    hxs = HtmlXPathSelector(response)
    item = CompuindiaItem()
    item['sourceurl'] = [response.url]
    item['code'] = hxs.select('//td[@class="data"]/text()')[0].extract().encode('utf-8')
    item['price'] = hxs.select('//span[@class="price"]/text()')[0].extract().encode('utf-8')
    # Not scraped yet; a match against class="last odd" could fill this in.
    item['color'] = [None]
    item['name'] = hxs.select("//div[@class='product-name']/h1/text()").extract()[0]
    item['features'] = hxs.select('//ul[@class="config_listing_pd_page"]/li/text()').extract()
    item['specs'] = hxs.select('//div[@class="box-collateral box-additional"]').extract()[0].encode('utf-8')
    item['description'] = hxs.select('//div[@class="box-collateral box-description"]').extract()[0].encode('utf-8')
    item['moreDescription'] = [None]
    item['additionalInfo'] = hxs.select('//div[@id="additional"]').extract()[0].encode('utf-8')
    item['relatedProducts'] = [None]  # not scraped for the moment
    # Images: the main product image plus the "more views" thumbnails,
    # deduplicated.
    main_img = hxs.select("//p[@class='product-image']/a/@href").extract()
    img_urls = hxs.select("//div[@class='more-views']/ul/li/a/@href").extract()
    item['image_urls'] = list(set(main_img + img_urls))
    return item
def parse_page2(self, response):
    """Write one CSV row for an article page; title/source/link arrive via
    response.meta from the listing-page callback."""
    sel = HtmlXPathSelector(response)
    article = ''.join(sel.xpath('//p/text()').extract())
    subheadline = ''.join(sel.xpath('//h2[@class="subheadline"]/text()').extract())
    # Relative age text, e.g. "3 hours 12 minutes ago".
    age_text = ''.join(sel.xpath('//abbr/text()').extract())
    millis = int(round(time.time() * 1000))  # current time in ms
    # BUG FIX: the original indexed age_text[1] -- a single character of the
    # joined string -- so the hour/minute branches never executed and str3
    # could be referenced before assignment.
    ntime = 0.0  # article age in minutes
    parts = age_text.split(" ")
    try:
        if "hour" in age_text:
            ntime += float(parts[0]) * 60
            if "minute" in age_text and len(parts) > 2:
                ntime += float(parts[2])
        elif "minute" in age_text:
            ntime += float(parts[0])
    except ValueError:
        # Non-numeric leading token (e.g. "an hour ago"); leave age at 0.
        ntime = 0.0
    # Timestamp column intentionally left blank (see disabled computation in
    # the project history).
    articletime = " "
    title = response.meta['Title']
    linktime = response.meta['LinkTime']
    source = response.meta['Source']
    link = response.meta['Link']
    # Store everything in the shared CSV writer.
    Consumer.writer.writerow([title.encode("utf-8"), subheadline.encode("utf-8"),
                              source.encode("utf-8"), linktime.encode("utf-8"),
                              articletime.encode("utf-8"), article.encode("utf-8"),
                              link.encode("utf-8")])
def parse(self, response):
    """Run the configured XPath checker against the page and delete the
    referenced object when the checker matches."""
    hxs = HtmlXPathSelector(response)
    # Checker type '4' means 404-only checking: a delivered page passes.
    if self.scraper.checker_type == '4':
        self.log("No 404. Item kept.", log.INFO)
        return
    try:
        test_select = hxs.select(self.scraper.checker_x_path).extract()
    except ValueError:
        self.log('Invalid checker x_path!', log.ERROR)
        return
    expected = self.scraper.checker_x_path_result
    if test_select and expected == '':
        # Mere presence of the element is the signal.
        self.log("Elements for XPath found on page (no result string defined).", log.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
    elif test_select and test_select[0] == expected:
        self.log("XPath result string '" + expected + "' found on page.", log.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
    else:
        self.log("XPath result string not found. Item kept.", log.INFO)
    return
def parse(self, response):
    """Follow category and pagination links, yield the page's products, and
    re-request an empty page up to three times."""
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//div[@class="navigation"]/ul/li/a/@href').extract()
    categories += hxs.select('//ul[@class="cl_subs"]//a/@href').extract()
    loaded = False
    for category in categories:
        loaded = True
        yield Request(category)
    next_page = hxs.select('//a[@rel="next"]/@href').extract()
    if next_page:
        loaded = True
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]))
    products = list(self.parse_products(hxs))
    for product in products:
        yield product
    # The site sometimes serves an empty shell; retry a few times.
    retries = response.meta.get('retries', 0)
    if (not products or not loaded) and retries < 3:
        yield Request(response.url, dont_filter=True, meta={'retries': retries + 1})
def parse_product(self, response):
    """Yield one product per variant row on a product page; the variant SKU
    text is appended to the page-level product name."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    name = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    # BUG FIX: when the <h1> was missing, `name` stayed a list, so the later
    # `n += ' ' + sku` extended it character by character instead of
    # concatenating strings.
    name = name[0].strip() if name else ''
    url = urljoin_rfc(get_base_url(response), response.url)
    for item in hxs.select('//div[@class="Item"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_value('url', url)
        sku = ''.join(item.select('./text()').extract())
        n = name
        if sku:
            n += ' ' + sku.strip()
        loader.add_value('name', n)
        # Two possible price locations; the loader takes the first match.
        loader.add_xpath('price', './/span[@class="price"]/text()')
        loader.add_xpath('price', './div[@class="price"]/span/text()')
        yield loader.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # Strip whitespace on input; join fragments with a space on output.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Register each configured field's XPath with the loader.
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
    # Output as json file: scrapy crawl livingsocial -o items.json
def parse_products(self, response):
    """Request each product's detail page; paginate only one level deep
    (pages reached via pagination do not paginate again)."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="prod"]')
    for product in products:
        name = product.select('div/form/fieldset/div/h5/a/span/text()').extract()[0].strip()
        url = product.select('div/form/fieldset/div/h5/a/@href').extract()
        if url:
            url = urljoin_rfc(get_base_url(response), url[0])
        price = product.select('div/form/fieldset/div/span[@class="productPrice priceExVAT"]/text()').extract()[0].strip()
        yield Request(url, callback=self.parse_product, meta={'name': name, 'price': price})
    pages = hxs.select('//span[@class="pagingButton"]/a/@href').extract()
    if pages:
        # FIX: use .get() -- 'do_pagination' is absent from meta on the
        # initial request and the plain [] lookup raised KeyError.
        if response.meta.get('do_pagination'):
            for page in pages:
                url = urljoin_rfc(get_base_url(response), page)
                yield Request(url, callback=self.parse_products, meta={'do_pagination': False})
    else:
        # No pagination buttons: descend into sub-categories instead.
        for sub_category in hxs.select('//div[@class="subcat"]/div/a/@href').extract():
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, callback=self.parse_products, meta={'do_pagination': True})
def parse(self, response):
    """Archive the index page HTML under ./archive/HTML/<date>/ and follow
    the first magic-set category link (debug mode)."""
    hxs = HtmlXPathSelector(response)
    magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/text()').extract()
    links_to_magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/@href').extract()
    # Only the first category is used while debugging.
    magic_sets = magic_sets_full[0]
    links_to_magic_sets = links_to_magic_sets_full[0]
    # After debugging, use instead:
    #   magic_sets_zip = dict(zip(magic_sets_full, links_to_magic_sets_full))
    magic_sets_zip = dict([[magic_sets, links_to_magic_sets]])
    date_prefix = time.strftime("%Y%m%d", time.localtime())
    try:
        os.mkdir("./archive/HTML/" + date_prefix)
    except OSError:
        self.log("The folder exists!")
    filename = "./archive/HTML/" + date_prefix + "/" + response.url.split("/")[-1] + ".htm"
    self.log("This is filename for index: %s" % (filename,))
    # FIX: use `with` so the archive file handles are closed promptly
    # instead of being leaked to the garbage collector.
    try:
        with open(filename, "wb") as f:
            f.write(response.body)
    except OSError:
        os.remove(filename)
        with open(filename, "wb") as f:
            f.write(response.body)
    # Continue to extract data.
    for magic_set, url in magic_sets_zip.iteritems():
        abs_url = urljoin("http://www.blackborder.com", url)
        self.log("This is magic set name and url to it: %s ---> %s" % (magic_set, abs_url))
        request = Request(abs_url, callback=self.parse_set_page)
        request.meta["magic_set"] = magic_set
        request.meta["date_prefix"] = date_prefix
        yield request
def getComments(self, response):
    """Attach the seller-comments text to the item, then (best-effort)
    follow the eBay bid-history page to fetch the posting date."""
    Item = response.meta['item']
    # Normalize the page: ASCII only, entities decoded twice, NBSP and
    # whitespace control characters removed, scripts/styles stripped.
    res_text = response.body_as_unicode().encode('ascii', 'ignore')
    res_text = smart_str(self.parser.unescape(self.parser.unescape(res_text))).replace('\xc2\xa0', '')
    res_text = res_text.replace('\n', ' ').replace('\t', ' ').replace('\r', '')
    res_text = re.subn('<script.*?</script>', '', res_text)[0]
    res_text = re.subn('<style.*?</style>', '', res_text)[0]
    hxs = HtmlXPathSelector(text=res_text)
    comments = ''
    for val in hxs.select('//div[@id="ds_div"]//text()').extract():
        val = val.strip()
        if val != '':
            comments += val + ' '
    Item['Comments'] = comments
    try:
        offers_url = 'http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=' + Item['eBay_Item_Number']
        if Item['eBay_Item_Number'] != 'NA' and Item['eBay_Item_Number'] != '':
            req = Request(offers_url, dont_filter=True, callback=self.getPostingDate)
            req.meta['item'] = Item
            return req
    except Exception:
        # FIX: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt; the deliberate best-effort fallthrough is kept.
        pass
    return Item
def parse(self, response):
    """Build historical gasoline price items from the IOC previous-prices
    table (one item per price/date pair)."""
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    variants_date = hxs.select("//span[@class='normal']//text()").extract()
    variants_price = hxs.select("//table[@id='objContPreviousPrices_grdPreviousPrices']//tr//td[@class='normal']//text()").extract()
    # NOTE: a 4-wide rolling average (av_price) used to be computed here via
    # self.__group_iter but was never read; the dead code was removed.
    items = []
    for price, date in zip(variants_price, variants_date):
        item = BillionPricesIndiaItem()
        quantity = '1 lt'
        item['date'] = date
        item['vendor'] = "ioc"
        item['product'] = "gasoline"
        item['category'] = "oil and gas"
        value, measure, unitprice = self.__unit_price(price, quantity)
        item['price'] = price
        item['quantity'] = value
        item['measure'] = measure
        item['unitprice'] = unitprice
        items.append(item)
    return items
def parse_qq_com(self, response):
    """Collect pv / vote / comment-page counters for a qq.com video."""
    hxs = HtmlXPathSelector(response)
    pid = ''.join(hxs.re('id :"(\w+)",'))
    vid = ''.join(hxs.re('vid:"(\w+)",'))
    # Play count.
    t1 = urllib.urlopen("http://sns.video.qq.com/tvideo/fcgi-bin/batchgetplaymount?id=%s&otype=json" % (pid,)).read()
    pv = ''.join(re.findall('"num":(\d+)', t1)).strip()
    # Votes: the original unpacks the two "num" values as (down, up).
    t2 = urllib.urlopen("http://sns.video.qq.com/tvideo/fcgi-bin/spvote?&t=3&otype=json&keyid=%s" % (vid,)).read()
    down, up = re.findall('"num":(\d+)', t2)
    # Comment page count ("totpg").
    t3 = urllib.urlopen("http://sns.video.qq.com/fcgi-bin/liveportal/comment?otype=json&p=1&t=0&sz=10&id=%s" % (pid,)).read()
    comments = ''.join(re.findall('"totpg":(\d+)', t3))
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse_youku_com(self, response):
    """Collect pv / up / down / comment counters for a youku.com video via
    the v_vpactionInfo stats page."""
    hxs = HtmlXPathSelector(response)
    video_id = hxs.re('var videoId.*?(\d+)')[0]
    url = "http://v.youku.com/v_vpactionInfo/id/%s" % (video_id,)
    text = urllib.urlopen(url).read()
    hxs2 = HtmlXPathSelector(text=text)
    # Counts are rendered with thousands separators, e.g. "1,234,567".
    pv = hxs2.select('//ul[@class="row"]//span[@class="num"]/text()').extract()[0]
    pv = int(''.join(pv.split(',')))
    # d_tmp[0] is "up/down"; d_tmp[2] is the comment count.
    # (Removed the unused local `ud` the original assigned here.)
    d_tmp = hxs2.select('//ul[@class="half"]//span/text()').extract()
    up, down = d_tmp[0].split('/')
    up, down = int(''.join(up.split(','))), int(''.join(down.split(',')))
    comments = int(''.join(d_tmp[2].split(',')))
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse(self, response):
    """Follow pay-as-you-go image-map links that filter by manufacturer."""
    hxs = HtmlXPathSelector(response)
    # Only URLs containing "/pay-as-you-go/" with a "manufacturer=" query.
    category_xpath = ('//map[@name="Mapre"]/area[contains(@href,"/pay-as-you-go/")'
                      ' and contains(@href,"manufacturer=")]/@href')
    for category in hxs.select(category_xpath).extract():
        url = urljoin_rfc(response.url, category, response.encoding)
        yield Request(url, callback=self.parse_cat)
def parse_sina_com_cn(self, response):
    """Collect pv and comment counters for a sina.com.cn video page."""
    hxs = HtmlXPathSelector(response)
    vid = hxs.re('vid:.*?(\d+)\|\d+')[0]
    nid = hxs.re("newsid:'([-\w]+)")[0]
    # Play count; the endpoint takes a "vid-vid" pair.
    url = "http://count.kandian.com/getCount.php?vids=%s&action=flash" % ("%s-%s" % (vid, vid))
    data = urllib.urlopen(url).read()
    pv = re.findall('\d+":"(\d+)', data)[0]
    # sina exposes no vote counts here.
    up = 0
    down = 0
    # Comment count: JSONP response, so strip the wrapping parentheses
    # before decoding.
    url2 = "http://comment5.news.sina.com.cn/cmnt/info_wb?channel=movie&newsid=%s&page=1&callback=" % (nid,)
    data2 = urllib.urlopen(url2).read()
    dj = json.loads(data2[1:-1])
    comments = dj["result"]['data']['total_number']
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse(self, response):
    """Parse one iXBT forum page: extract posts, emit IxbtItem objects,
    and follow the "next page" link.

    Posts are embedded in ``t_post(...)`` script calls; their arguments
    are recovered with a regex rather than a JS parser.  Posts with more
    than 2 "thanks" also get their jokes appended to ``self.file``
    (UTF-8 encoded).
    """
    hxs = HtmlXPathSelector(response)
    comments = hxs.select('//script[contains(text(),"t_post")]')
    #filename = response.url.split("/")[-2]
    self.file.write('comments: ' + str(len(comments)) + '\n\n')
    #items = []
    for comment in comments:
        # Pull the t_post(...) arguments: quoted strings or bare numbers.
        #pattern = re.compile(r"'?([^(,]+)'?,")
        pattern = re.compile(r"('(.*?)'|(\d+),)", re.S)
        results = pattern.findall(comment.extract())
        # Prefer the numeric capture group, else the quoted-string group.
        comment_items = list((x[2] if x[2] else x[1]) for x in results)
        item = IxbtItem()
        # Argument 5 is the semicolon-separated "thanks" list; its length
        # approximates the number of people who thanked the post.
        if len(comment_items) > 5:
            text = comment_items[5]
            item['grats'] = len(text.split(';'))
        else:
            item['grats'] = 0
        item['text'] = []
        text = ''
        # Argument 4 is the post body HTML; normalise it to plain text
        # and split it into individual jokes on blank lines.
        if len(comment_items) > 4:
            text = comment_items[4];
            text = re.sub(r'<br>', '\n', text)
            text = re.sub(r'<p>.*<p>', '\n', text)
            text = re.sub(r'\\n', '\n', text)
            #text = re.sub(r'\<.*', '', text)
            #text = re.sub(r'\<[^>]*\>', '', text)
            # Drop very short lines (signatures, greetings, etc.).
            text = re.sub(r'(\n|^).{1,20}(\n)+', '\n', text)
            #text = re.sub(r'(\n){3,}', '\n\n', text)
            #text = re.sub(r'\s+$', '', text)
            #text = re.sub(r'^\s+', '', text)
            pattern = re.compile(r'(.+?)(\n\n|$)', re.S)
            tuples = pattern.findall(text)
            # Keep only fragments longer than 12 chars as jokes.
            item['text'] = list(x[0].strip() for x in tuples if len(x[0].strip()) > 12)
        # Argument 1 is the author; argument 0 is the post anchor id.
        item['author'] = comment_items[1]
        item['url'] = response.url + u'#' + comment_items[0]
        if item['grats'] > 2:
            self.file.write('Автор: ' + item['author'].encode('UTF-8') + '\n')
            self.file.write(str(item['grats']) + ' человек сказали спасибо\n')
            self.file.write(item['url'] + '\n')
            # NOTE(review): `s` is computed but never used.
            s = '\n'.join(item['text'])
            self.file.write('кол-во анекдотов: ' + str(len(item['text'])) + '\n')
            #self.file.write(comment_items[4].encode('UTF-8'))
            for joke in item['text']:
                self.file.write(joke.encode('UTF-8') + '\n\n')
        #items.append(item)
        yield item
    # Pagination: the next-page href lives inside a t_assign(...) script.
    next_url = hxs.select('//script[contains(text(),"t_assign")]').re(u'href=([^ ]*?)>далее')
    if len(next_url) > 0:
        next_url = next_url[0]
        # NOTE(review): `parsed_url` is computed but never used.
        parsed_url = urlparse(next_url)
        next_url = urljoin(response.url, next_url)
        yield Request(next_url, callback=self.parse)
        self.file.write("Следующая страница: " + next_url.encode('UTF-8') + '\n')
def parse_categories(self, response):
    """Queue a product-listing request for every category link on the page."""
    selector = HtmlXPathSelector(response)
    for category_href in selector.select('//*[@id="categorylist"]/ul[@class="categories"]/li/h2/a/@href').extract():
        yield Request(category_href, callback=self.parse_products)
def parse_one_supporters_page(self, response):
    """Parse one page of project backers and yield a Proj_Supporter item
    per backer.

    The project id is taken from the first digit run in ``response.url``;
    each supporter field is only set when exactly one node matched, so
    partially-filled items are possible.
    """
    hxs = HtmlXPathSelector(response)
    # titles = hxs.select("//span[@class='pl']") # avoid double parse here???
    # First run of digits in the URL is the project id; -1 = not found.
    backer_url = re.search('[0-9]+', response.url)
    PROJ_ID = -1
    if backer_url != None:
        # self.log('parse the proj_id in backer page error in %s' %response.url)
        #else:
        PROJ_ID = backer_url.group(0)
    backers = hxs.select(
        "//div[@class='projects-backers-left']/div[@class='supporters']")
    items = []
    for backer in backers:
        item = Proj_Supporter()
        supporter_name = backer.select(
            ".//div[@class='supportersmeta']/div[@class='supportersmeta-t']/a[@class='supportersmeta-t-a']/text()"
        ).extract()
        # The supporter id is embedded in the profile href; cleaned below.
        supporter_id = backer.select(
            ".//div[@class='supportersmeta']/div[@class='supportersmeta-t']/a[@class='supportersmeta-t-a']/@href"
        ).extract()
        supporter_icon = backer.select(
            ".//div[@class='supportersmeta']/div[@class='supportersmeta-t']/div[@class='icon-sun-ms']/a/text()"
        ).extract()
        # Positional text() nodes of the meta block — assumes layout
        # [2]=time, [3]=amount, [4]=total projects; TODO confirm markup.
        supporter_total_support_proj = backer.select(
            ".//div[@class='supportersmeta']/text()[4]").extract()
        supporter_support_time = backer.select(
            ".//div[@class='supportersmeta']/text()[2]").extract()
        supporter_support_amount = backer.select(
            ".//div[@class='supportersmeta']/text()[3]").extract()
        #print "supporter name", supporter_name
        #print "supporter url", supporter_url
        #print "supporter icon level ", supporter_icon
        #print "supporter_total_support_proj ", supporter_total_support_proj
        #print "supporter_support_time ", supporter_support_time
        #print "supporter total support", supporter_support_amount
        # Each field is set only when the selector matched exactly once.
        if len(supporter_name) == 1:
            item['supporter_name'] = supporter_name[0]
        if len(supporter_id) == 1:
            item['supporter_id'] = item.clean_supporter_id(supporter_id[0])
        if len(supporter_icon) == 1:
            item['supporter_icon'] = item.clean_supporter_icon(
                supporter_icon[0])
        if len(supporter_support_time) == 1:
            item[
                'supporter_support_time'] = item.clean_supporter_support_time(
                    supporter_support_time[0])
        if len(supporter_support_amount) == 1:
            item['supporter_support_amount'] = supporter_support_amount[0]
        if len(supporter_total_support_proj) == 1:
            item[
                'supporter_total_support_proj'] = item.clean_supporter_total_support_proj(
                    supporter_total_support_proj[0])
        item['supporter_proj_id'] = PROJ_ID
        items.append(item)
    for item in items:
        yield item  # return items """
def parse_proj_info(self, response):
    """Parse a demohour.com project page and yield several item types:

    - one ``Proj_Item`` (core project table row),
    - one ``Proj_Owner_Item`` (owner table row),
    - supporter items forwarded from ``parse_backers_links``,
    - one ``Proj_Topic`` (topic/buzz counters),
    - one ``Proj_Incentive_Options_Item`` per reward tier,

    plus a ``Request`` for the backers listing page.  Most fields are
    set only when the corresponding selector matched the expected number
    of nodes; failures are logged (and sometimes printed) but do not
    abort the whole parse except where noted.
    """
    hxs = HtmlXPathSelector(response)
    ##################################################################################################################
    # section of proj table
    # (proj_url, proj_id(PK), proj_name, proj_funding_target, proj_current_funding_amount, proj_current_funding_percentage, proj_status, proj_left_over_time, proj_owner_name,
    # proj_owner_location, proj_supporter_count, proj_surfer_count, proj_topic_count)
    ###################################################################################################################
    proj = Proj_Item()
    # get proj url, add prefix to get the complete url
    proj_url = hxs.select(
        "//div[@class='ui-tab']/div[@class='ui-tab-top']/h1/a/@href"
    ).extract()
    if len(proj_url) != 1:
        # Without the project URL nothing else can be keyed — bail out.
        self.log("Parse the proj url error. %s" % response.url)
        return
    else:
        proj['proj_url'] = self.add_url_prefix(proj_url[0])
    # one very important id -->Proj_Id
    # if len(
    # The relative href is expected to look like "/projects/<id>",
    # i.e. three '/'-separated pieces; the last piece is the id.
    PROJ_ID = proj_url[0].split('/')
    if len(PROJ_ID) != 3:
        self.log("Parse Proj_id error. %s" % response.url)
    else:
        PROJ_ID = PROJ_ID[len(PROJ_ID) - 1]
        proj['proj_id'] = PROJ_ID
    # get the proj name
    proj_title = hxs.select(
        "//div[@class='ui-tab']/div[@class='ui-tab-top']/h1/a/text()"
    ).extract()
    if len(proj_title) != 1:
        self.log("Parse the proj name error. %s" % response.url)
    else:
        proj['proj_name'] = proj_title[0]
    # The sidebar class encodes the project state; try each known
    # variant until one matches.
    projs_sidebar_funding = hxs.select("//div[@class='sidebar-funding']")
    if len(projs_sidebar_funding) == 0:
        projs_sidebar_funding = hxs.select(
            "//div[@class='sidebar-warming']")
    if len(projs_sidebar_funding) == 0:
        projs_sidebar_funding = hxs.select(
            "//div[@class='sidebar-success']")
    if len(projs_sidebar_funding) == 0:
        projs_sidebar_funding = hxs.select(
            "//div[@class='sidebar-failure']")
    if (len(projs_sidebar_funding) != 1):
        self.log("Parse the proj table error. %s" % response.url)
        print "Parse the proj table error. %s" % response.url
    else:
        # get proj_funding_target
        p = projs_sidebar_funding[0]
        proj_funding_target = p.select(
            ".//div[@class='sidebar-money-raised-num-t']/b/text()"
        ).extract()
        print proj_funding_target
        if len(proj_funding_target) == 1:
            proj['proj_funding_target'] = proj.clean_proj_funding_target(
                proj_funding_target[0])
        # get proj_current_funding_amount
        proj_current_funding_amount = p.select(
            ".//div[@class='sidebar-money-raised-num']/b/text()").extract()
        print proj_current_funding_amount
        if len(proj_current_funding_amount) == 1:
            proj[
                'proj_current_funding_amount'] = proj.clean_proj_current_funding_amount(
                    proj_current_funding_amount[0])
        # get proj_current_funding_percentage
        proj_current_funding_percentage = p.select(
            ".//span[@class='sidebar-percentage-progress-span']/text()"
        ).extract()
        print proj_current_funding_percentage
        if len(proj_current_funding_percentage) != 1:
            self.log(
                "Parse the proj_current_funding_percentage at url = %s"
                % response.url)
        else:
            # First digit run of the percentage text; stored as a
            # fraction (Decimal / 100).
            percentage = re.search('[\d]+',
                                   proj_current_funding_percentage[0])
            if percentage == None:
                self.log(
                    "Parse the proj_current_funding_percentage at url = %s"
                    % response.url)
            else:
                percentage = percentage.group(0)
                proj['proj_current_funding_percentage'] = Decimal(
                    percentage.strip('"')) / 100
        # this is how many people support this proj
        proj_supporter_count = p.select(
            ".//div[@class='sidebar-number-days-l']/b/b/text()").extract()
        print "support num:", proj_supporter_count
        if len(proj_supporter_count) == 1:
            proj['proj_supporter_count'] = proj_supporter_count[0]
        # this is how many people view this proj
        proj_surfer_count = p.select(
            ".//div[@class='sidebar-number-days-m']/b/b/text()").extract()
        print "people view ", proj_surfer_count
        if len(proj_surfer_count) == 1:
            proj['proj_surfer_count'] = proj_surfer_count[0]
        # get topic of the proj
        topic_count = hxs.select(
            "//ul[@class='ui-tab-menu']/li/a/span[@id='posts_count']/text()"
        ).extract()
        if len(topic_count) != 1:
            self.log("Parse topic count error. %s" % response.url)
            print "Parse topic count error. %s" % response.url
        else:
            proj['proj_topic_count'] = topic_count[0]
        # get the proj_status
        proj_status = p.select(
            ".//div[@class='sidebar-number-days-r']/span/text()").extract()
        if len(proj_status) != 1:
            self.log("Parse proj status error. %s" % response.url)
            print "Parse proj status error. %s" % response.url
        else:
            proj['proj_status'] = proj_status[0]
        # get how many days left
        proj_leftover_time = p.select(
            ".//div[@class='sidebar-number-days-r']/b/b/text()").extract()
        print "days left ", proj_leftover_time
        if len(proj_leftover_time) == 1:
            proj['proj_leftover_time'] = proj_leftover_time[0]
        # get the unit of left_over
        proj_leftover_time_units = p.select(
            ".//div[@class='sidebar-number-days-r']/b/text()").extract()
        if len(proj_leftover_time_units) == 1:
            proj['proj_leftover_time_unit'] = 0  # proj complete
        elif len(proj_leftover_time_units) == 2:
            proj['proj_leftover_time_unit'] = proj_leftover_time_units[1]
        else:
            self.log("Can not parse proj left over time at url=%s" %
                     response.url)
            print "Parse proj left over time error. %s" % response.url
    # get proj_owner information
    projs_owner = hxs.select("//div[@class='project-by']")
    if len(projs_owner) != 1:
        self.log("Parse proj owner error. %s" % response.url)
    else:
        p = projs_owner[0]
        proj_owner_owner_name = p.select(
            ".//a[@class='project-by-img-r-author']/text()").extract()
        if len(proj_owner_owner_name) == 1:
            proj['proj_owner_name'] = proj_owner_owner_name[0]
    # get proj_location --> this wil be extracted in another table
    # reason is this information may not be available at back page, only exist in main page
    yield proj
    # end of section of proj table
    ##################################################################################################################
    ##################################################################################################################
    # section of section of proj_owner_table
    # (proj_owner_owner_id(PK), proj_owner_proj_id(PK), proj_owner_owner_name, proj_owner_star_level, proj_owner_last_log_in_time,
    # proj_owner_own_proj_count, proj_owner_support_proj_count )
    ##################################################################################################################
    projs_owner = hxs.select("//div[@class='project-by']")
    if len(projs_owner) != 1:
        self.log("Parse the proj_owner error. %s" % response.url)
        print "Parse the proj_owner error. %s" % response.url
    else:
        p = projs_owner[0]
        proj_owner = Proj_Owner_Item()
        proj_owner_owner_id = p.select(
            ".//a[@class='project-by-img-r-author']/@href").extract()
        print "proj name url: ", proj_owner_owner_id
        if len(proj_owner_owner_id) != 1:
            self.log("Parse proj owner id from page %s error" %
                     response.url)
        else:
            # Owner id = trailing digit run of the profile URL.
            owner_id = re.search('[0-9]+$', proj_owner_owner_id[0])
            if owner_id == None:
                self.log("Extract the proj owner id from url = %s error" %
                         response.url)
            else:
                proj_owner['proj_owner_owner_id'] = owner_id.group(0)
        proj_owner['proj_owner_proj_id'] = PROJ_ID
        proj_owner_owner_name = p.select(
            ".//a[@class='project-by-img-r-author']/text()").extract()
        print "proj name: ", proj_owner_owner_name
        if len(proj_owner_owner_name) == 1:
            proj_owner['proj_owner_owner_name'] = proj_owner_owner_name[0]
        proj_owner_star_level = p.select(
            ".//div[@class='project-by-img-r']/div[@class='icon-sun-m']/a/text()"
        ).extract()
        print "proj proj_owner_star_level: ", proj_owner_star_level
        if len(proj_owner_star_level) == 1:
            proj_owner['proj_owner_star_level'] = proj_owner_star_level[0]
        proj_owner_last_log_in_time = p.select(
            ".//div[@class='project-by-last-time']/text()").extract()
        print "proj last update time,", proj_owner_last_log_in_time
        # NOTE(review): [0] will raise IndexError if the selector matched
        # nothing — unlike the other fields there is no length guard here.
        log_in = re.search('[\d]+/[\d]+/[\d]+',
                           proj_owner_last_log_in_time[0])
        if log_in == None:
            self.log(
                "parse proj owner proj_owner_last_log_in_time error at page %s"
                % response.url)
        else:
            proj_owner['proj_owner_last_log_in_time'] = log_in.group(0)
        proj_by_post_support_list = p.select(
            ".//div[@class='project-by-post']/a[@target='_blank']/span/text()"
        ).extract()
        proj_owner_support_proj_count = 0
        proj_owner_own_proj_count = 0
        if len(proj_by_post_support_list) >= 1:
            proj_owner_support_proj_count = proj_by_post_support_list[0]
            proj_owner[
                'proj_owner_support_proj_count'] = proj_by_post_support_list[
                    0]
        if len(proj_by_post_support_list) >= 2:
            proj_owner_own_proj_count = proj_by_post_support_list[1]
            proj_owner[
                'proj_owner_own_proj_count'] = proj_by_post_support_list[1]
        print "proj owner supports:", proj_owner_support_proj_count
        print "proj owner owns:", proj_owner_own_proj_count
        yield proj_owner
    # end of section of proj_owner_table
    ##################################################################################################################
    ##################################################################################################################
    # section of donation table, we need to follow the link within the donor page (pagination)
    ##########################################################################################
    #u'/projects/318262/backers' #
    # >>> response.url #
    # 'http://www.demohour.com/projects/318262' #
    ##########################################################################################
    backers = hxs.select(
        "//div[@class='ui-tab-layout']/ul[@class='ui-tab-menu']/li/a/@href"
    )
    if len(backers) == 3:
        # we have current tab, posts and backers tab
        backer_relative_urls = backers[2].extract().split('/')
        backer_relative_url = backer_relative_urls[
            len(backer_relative_urls) - 1]
        backers_full_url = response.url + '/' + backer_relative_url
        yield Request(backers_full_url, self.parse_backers_links)
        # NOTE(review): this also calls parse_backers_links on the
        # CURRENT response (the project page), not the backers page the
        # Request above targets — presumably to catch backers inlined on
        # the main page; confirm this is intentional.
        for supporter in self.parse_backers_links(
                response):  # we have supporter information here
            print "supporter name:", supporter['supporter_name']
            print "supporter url:", supporter['supporter_url']
            print "supporter icon:", supporter['supporter_icon']
            print "supporter support time", supporter[
                'supporter_support_time']
            print "supporter support amount", supporter[
                'supporter_support_amount']
            print "supporter support total proj count", supporter[
                'supporter_total_support_proj']
            supporter['supporter_proj_id'] = PROJ_ID
            yield supporter
    # end of section of donation table
    ##################################################################################################################
    # if we want to add the user information table, we will do sth similar to the back table here
    ###################################################################################################################################
    # section of Topic table
    # (topic_proj_id(PK), topic_total_buzz_count, topic_announcement_count, topic_question_count, topic_up_count, topic_down_count, topic_proj_category, topic_proj_location )
    ###################################################################################################################################
    projs_topic = hxs.select("//div[@class='projects-home-left']")
    if len(projs_topic) == 1:
        #self.log("Parse the topic at the end of the page error at url = %s" %response.url)
        #else:
        proj_topic = Proj_Topic()
        proj_topic['topic_proj_id'] = PROJ_ID
        # get the topic_total_buzz_count
        topic_total_buzz_count = projs_topic.select(
            ".//li/a[@id='filter_all']/span/text()").extract()
        if len(topic_total_buzz_count) != 1:
            self.log("Parse topic_total_buzz_count error at url = %s" %
                     response.url)
        else:
            proj_topic['topic_total_buzz_count'] = topic_total_buzz_count[
                0]
        # Positional layout assumed: [1]=announcements, [2]=questions,
        # [3]=up, [4]=down — TODO confirm against page markup.
        topic_all_count = projs_topic.select(
            ".//li/a[@data-remote='true']/span/text()").extract()
        if len(topic_all_count) < 5:
            self.log("Parse other buzz count error at url = %s" %
                     response.url)
        else:
            proj_topic['topic_announcement_count'] = topic_all_count[1]
            proj_topic['topic_question_count'] = topic_all_count[2]
            proj_topic['topic_up_count'] = topic_all_count[3]
            proj_topic['topic_down_count'] = topic_all_count[4]
        # now we will get the proj tags, e.g., category, location
        projs_tag = hxs.select(
            ".//div[@class='projects-home-left-seat']/a[@target='_blank']/text()"
        ).extract()
        if len(projs_tag) != 3:
            # NOTE(review): this `return` skips the reward section below
            # whenever the tags do not parse.
            self.log("Parse proj tag error at url = %s" % response.url)
            return
        else:
            proj_topic['topic_proj_category'] = projs_tag[0]
            proj_topic['topic_proj_owner_name'] = projs_tag[1]
            proj_topic['topic_proj_location'] = projs_tag[2]
        yield proj_topic
    # yield item
    ###################################################################################################################################
    # section of incentive/reward table
    # (incentive_proj_id(PK), incentive_id(PK), incentive_expect_support_amount, incentive_current_supporter_count, incentive_total_allowable_supporter_count,
    # incentive_description, incentive_reward_shipping_method, incentive_reward_shipping_time)
    ###################################################################################################################################
    projs_reward_options = hxs.select("//div[@class='reward-options']/ul")
    rewards = []
    # NOTE(review): firstIncentive is never used.
    firstIncentive = True
    for p in projs_reward_options:
        reward = Proj_Incentive_Options_Item()
        reward['incentive_proj_id'] = PROJ_ID
        # get incentive_expect_support_amount
        incentive_expect_support_amount = p.select(
            ".//li[@class='support-amount']/text()[2]").extract()
        print "support amount: ", incentive_expect_support_amount
        if len(incentive_expect_support_amount) == 1:
            reward[
                'incentive_expect_support_amount'] = reward.clean_expect_support_amount(
                    incentive_expect_support_amount[0])
        # if len(support_amount) == 1:
        #     reward['incentive_expect_support_amount'] = support_amount[0]
        # get incentive_current_supporter_count
        incentive_current_supporter_count = p.select(
            ".//li[@class='support-amount']/span/text()").extract()
        print "supporter number:", incentive_current_supporter_count
        if len(incentive_current_supporter_count) == 1:
            count = reward.clean_current_supporter_count(
                incentive_current_supporter_count[0])
            if len(count) == 1:
                reward['incentive_current_supporter_count'] = count[0]
        # get incentive_total_allowable_supporter_count, if any
        incentive_total_allowable_supporter_count = p.select(
            ".//li[@class='supporter-number']/div[@class='supporter-limit']/p/text()"
        ).extract()
        if len(incentive_total_allowable_supporter_count) == 1:
            quote = reward.clean_total_allowable_supporter_count(
                incentive_total_allowable_supporter_count[0])
            if len(quote) >= 1:
                reward[
                    'incentive_total_allowable_supporter_count'] = quote[0]
        # get incentive_description,
        incentive_description = p.select(
            ".//li[@class='returns-contents']/p/text()").extract()
        if len(incentive_description) >= 1:
            reward[
                'incentive_description'] = reward.clean_incentive_descriptions(
                    incentive_description[0])
        # get incentive_reward_shipping_method, if any
        # One text node => shipping time only; two => method then time.
        incentive_reward_shipping_time_and_method = p.select(
            ".//li[@class='returns-contents-time']/p/text()").extract()
        if len(incentive_reward_shipping_time_and_method) == 1:
            shipping_time = reward.clean_reward_shipping_time(
                incentive_reward_shipping_time_and_method[0])
            if len(shipping_time) >= 1:
                reward['incentive_reward_shipping_time'] = shipping_time[0]
        elif len(incentive_reward_shipping_time_and_method) == 2:
            shipping_method = incentive_reward_shipping_time_and_method[0]
            reward['incentive_reward_shipping_method'] = shipping_method
            # NOTE(review): shadows the module/builtin name `time` locally.
            time = reward.clean_reward_shipping_time(
                incentive_reward_shipping_time_and_method[1])
            if len(time) >= 1:
                reward['incentive_reward_shipping_time'] = time[0]
        rewards.append(reward)
    ###################################################################################################################################
    # end of table incentive/reward
    ###################################################################################################################################
    for reward in rewards:
        yield reward
def parse_product(self, response):
    """Parse a product page and yield one Product per model row.

    Each row of the ``tabModeles`` table becomes an item whose
    identifier is ``"<page id>.<row index>"``; when the page carries no
    model table, a single item is emitted at the base price.

    Idiom fix: the original iterated ``enumerate(models)`` as raw tuples
    and manually rebound ``model = model[1]``; tuple unpacking in the
    ``for`` header is equivalent and clearer (``'{}'.format(i)`` renders
    an int identically to the old ``str(model[0])``).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Numeric page id embedded in the URL, e.g. "...-1234.html".
    identifier = re.search('-([\d\,]+)\.html', response.url).group(1)
    product_name = hxs.select(
        '//div[@itemprop="name"]/text()')[0].extract().strip()
    base_price = hxs.select('//p[@itemprop="Price"]/text()')[0].extract()
    # Decimal part is rendered in a separate span; append when present.
    base_price_decimal = hxs.select(
        '//p[@itemprop="Price"]/span[@class="decimal"]/text()').extract()
    if base_price_decimal:
        base_price += base_price_decimal[0]
    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    category = hxs.select('//div[@id="filCateg"]/a/text()').extract()
    brand = hxs.select(
        '//span[@itemprop="brand"]/text()')[0].extract().strip()
    out_of_stock = hxs.select('//img[@id="BoutonIndispo"]')
    models = hxs.select('//table[@class="tabModeles"]/tr[@class="tr_FA"]')
    for i, model in enumerate(models):
        model_name = model.select(
            './/td[@class="ref"]/text()')[0].extract().strip()
        model_price = model.select(
            './/td[@class="prix"]/span[@class="Normal" or @class="NormalSansCoupon" or @class="Promo" or @class="TopPrix"]/text()'
        )[0].extract()
        model_price_decimal = model.select(
            './/td[@class="prix"]/span[@class="Normal" or @class="NormalSansCoupon" or @class="Promo" or @class="TopPrix"]/span[@class="decimal"]/text()'
        ).extract()
        if model_price_decimal:
            model_price += model_price_decimal[0]
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', '{}.{}'.format(identifier, i))
        loader.add_value('sku', model_name)
        loader.add_value('url', response.url)
        loader.add_value('name', u'{} {}'.format(product_name, model_name))
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', brand)
        loader.add_value('category', category[-1] if category else '')
        loader.add_value('price', model_price)
        if out_of_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
    if not models:
        # No per-model table: one item at the base price.
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_value('name', product_name)
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', brand)
        loader.add_value('category', category[-1] if category else '')
        loader.add_value('price', base_price)
        if out_of_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
def parse_product(self, response):
    """Parse a wms.co.uk product page and yield Product items.

    Handles three cases per product block: a variant ``<select>`` (one
    item per option, availability/price scraped from the inline
    SwapVariant script), a plain block (single item), and "You may need"
    accessory rows.  Orders up to 100 get a 5.33 shipping cost.
    """
    # NOTE(review): this replace() swaps a string for itself — a no-op.
    # The original was presumably escaping a bare '<' (e.g. '&lt;3kg')
    # that breaks the HTML parser on this one URL; the intended
    # replacement text appears to have been lost. Confirm and restore.
    if response.url == 'http://www.wms.co.uk/Pulse_Oximetry/Handheld_Pulse_Oximeters/Huntleigh_Smartsigns_MiniPulse_MP1R_Rechargeable_Pulse_Oximeter?PC=W6609':
        text = response.body.replace('<3kg', '<3kg')
        hxs = HtmlXPathSelector(text=text)
    else:
        hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//*[@id="Images_Main"]/@src').extract()
    image_url = urljoin(base_url, image_url[0]) if image_url else ''
    category = hxs.select(
        '//*[@id="Breadcrumb_Div"]/div/a[2]/text()').extract()
    category = category[0] if category else ''
    products = hxs.select(
        '//*[@id="Product_Div_Outer"]//div[@class="Product_Grid_Outer"]')
    for product in products:
        try:
            price = product.select(
                './/div[@class="Product_Grid_Price"]/text()').extract(
                )[0].strip()
            price = extract_price(price)
        except Exception as e:
            # Price missing/unparseable: skip this block entirely.
            self.log("Couldn't find price for product {}, error code: {}".
                     format(response.url, e))
            continue
        availability = product.select(
            './/div[@class="Product_Grid_Availability"]/text()').extract()
        if availability and availability[0].strip(
        ) == 'This product is no longer available':
            self.log('Product {} is no longer available'.format(
                response.url))
            continue
        options = product.select(
            './/select[@class="Product_Grid_Variant_Select"]/option')
        if options:
            # Variant data lives in the SwapVariant JS; split into lines
            # so per-option availability/price can be grepped out.
            x = hxs.select(
                '//script[contains(text(), "SwapVariant(event, intPC)")]'
            ).extract()[0]
            options_availability_lines = x.split('\r\n')
            name = product.select(
                './div[@class="Product_Grid_Description"]/text()').extract(
                )[0].strip()
            for option in options:
                loader = ProductLoader(item=Product(), selector=product)
                identifier = option.select('./@value').extract()[0]
                option_name = option.select(
                    './text()').extract()[0].strip()
                option_availability = ''
                for line in options_availability_lines:
                    if identifier in line:
                        if any(word in line for word in [
                                'strInner = ""',
                                'Please contact us for availability',
                                'None in stock'
                        ]):
                            option_availability = 'out of stock'
                        if 'This product is no longer available' in line:
                            option_availability = 'delisted'
                        # self.log("==============={}================".format(line))
                if option_availability == 'delisted':
                    self.log('Product {} is delisted'.format(response.url))
                    continue
                elif option_availability == 'out of stock':
                    loader.add_value('stock', 0)
                loader.add_value('url', response.url)
                loader.add_value('name', name + ' ' + option_name)
                loader.add_value('image_url', image_url)
                loader.add_value('category', category)
                # Per-option price is on the script line mentioning both
                # the option id and "PRICE"; fall back to block price.
                price_line = ''
                for line in response.body_as_unicode().split('\n'):
                    if identifier.upper() in line.upper(
                    ) and 'PRICE' in line.upper():
                        price_line = line
                option_price = re.findall("strPrice = (.*)<br", price_line)
                option_price = extract_price(
                    option_price[0]) if option_price else 0
                option_price = option_price if option_price else price
                loader.add_value('price', option_price)
                loader.add_value('sku', identifier)
                loader.add_value('identifier', identifier)
                if int(price) <= 100:
                    loader.add_value('shipping_cost', 5.33)
                yield loader.load_item()
        else:
            # No variants: single item for this product block.
            loader = ProductLoader(item=Product(), selector=product)
            availability = product.select(
                './/div[@class="Product_Grid_Availability"]/span/text()'
            ).extract()
            if availability:
                availability = availability[0].strip()
                if 'None in stock' in availability or 'Please contact us for availability' in availability:
                    loader.add_value('stock', 0)
            loader.add_value('url', response.url)
            name = product.select(
                './div[@class="Product_Grid_Description"]/text()').extract(
                )
            if not name:
                name = hxs.select("//h1/text()").extract()
            name = name.pop().strip()
            sku = product.select(
                './div[@class="Product_Grid_Code_Availability_Outer"]//strong/text()'
            ).extract()
            if not sku:
                sku = product.select(
                    './/div[@class="Product_Grid_Code_Availability_Outer"]//text()'
                ).re(".* (.*)")
            sku = sku.pop().strip()
            loader.add_value('name', name)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_value('price', price)
            loader.add_value('sku', sku.strip())
            identifier = product.select(
                './/input[contains(@name,"PC")]/@value').extract()[0]
            # Combine sku and form PC value when they differ so the
            # identifier stays unique.
            if sku.strip() != identifier.strip():
                loader.add_value('identifier',
                                 identifier.strip() + '-' + sku.strip())
            else:
                loader.add_value('identifier', identifier.strip())
            if int(price) <= 100:
                loader.add_value('shipping_cost', 5.33)
            yield loader.load_item()
    # Accessory rows ("You may need") become plain items keyed by sku.
    other_products = hxs.select(
        '//div[@id="You_May_Need"]/div[@class="Product_Page_Accessories_Row"]'
    )
    for product in other_products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', response.url)
        name = product.select(
            'div[contains(@class, "_Description")]/text()').extract()
        if not name:
            self.log('Name not found for product {}'.format(response.url))
            continue
        loader.add_value('name', name[0])
        loader.add_value('category', category)
        price = product.select(
            'div[contains(@class, "_Price")]/strong/font/text()').extract(
            )[0].strip()
        price = extract_price(price)
        loader.add_value('price', price)
        sku = product.select(
            'div[contains(@class, "_Code")]/div/strong/text()').extract(
            )[0].strip()
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        if int(price) <= 100:
            loader.add_value('shipping_cost', 5.33)
        yield loader.load_item()
def parse_person_detail(self, response):
    """Fill long_description and presence on the Person item.

    Reuses the item passed in via response.meta['person'] when present,
    otherwise starts a fresh one.
    """
    hxs = HtmlXPathSelector(response)
    person = response.meta.get('person') or Person()
    paragraphs = hxs.select('//h1[@class="h1_first"]/following-sibling::p/text()').extract()
    person['long_description'] = "\n".join(paragraphs)
    presence_parts = hxs.select('//h2[text()[normalize-space(.)="Web Presences"]]/following-sibling::div[position()=1]//text()').extract()
    person['presence'] = ','.join(presence_parts).strip()
    yield person
def parse(self, response):
    """Queue a company-list request for every alphabet-navigation link."""
    selector = HtmlXPathSelector(response)
    nav_links = selector.select('//ul[@class="col1_alpha_nav"]/li/a/@href').extract()
    for link in nav_links:
        yield Request(self.url_base + link, callback=self.parse_company_list)
def parse_product(self, response):
    """Parse a Magento-style product page and yield Product items.

    Emits the base product, then — when the page carries a
    ``spConfig`` options block — one item per configurable option,
    optionally multiplied by custom-option combinations.

    Bug fix: when the page has more than one custom-option ``<select>``,
    the original inner loop iterated ``options_containers.select(...)``
    (ALL selects) on every pass instead of the current container, so
    every combination slot was built from the concatenation of all
    selects' options.  It now iterates the current container only.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)
    brand = hxs.select('//span[@itemprop="brand"]/span/text()').extract()
    brand = brand[0] if brand else ''
    product_name = hxs.select('//h1[@itemprop="name"]/text()').extract()
    product_name = product_name[0].strip()
    product_price = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    product_price = extract_price(product_price)
    product_code = hxs.select(
        '//div[@class="product-name"]/meta[@itemprop="sku"]/@content'
    ).extract()[0]
    image_url = hxs.select(
        '//div[@class="product-img-box"]/div/a/img/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//div[@id="imageShowcase"]/img/@src').extract()
    image_url = image_url[0] if image_url else ''
    categories = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', product_code)
    loader.add_value('identifier', product_code)
    loader.add_value('brand', brand)
    loader.add_value('shipping_cost', '4.99')
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url))
    # Keep all breadcrumb levels except the generic "Brands" entry.
    for category in categories:
        if category.upper() != 'BRANDS':
            loader.add_value('category', category)
    loader.add_value('price', product_price)
    out_of_stock = hxs.select('//p[@class="availability out-of-stock"]')
    if out_of_stock:
        loader.add_value('stock', 0)
    item = loader.load_item()
    # Configurable-product data is embedded as JSON in a script tag.
    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        option_item = deepcopy(item)
        product_data = json.loads(options_config.groups()[0])
        # products: option-product id -> concatenated option labels.
        # prices: option-product id -> accumulated price delta.
        products = {}
        prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
                    prices[product] = prices.get(
                        product, 0) + extract_price(option['price'])
        options_containers = hxs.select(
            '//select[contains(@name, "options[")]')
        extra_options = []
        if len(options_containers) > 1:
            # Several custom-option selects: build the cartesian product
            # of their options.
            combined_options = []
            for options_container in options_containers:
                element_options = []
                # FIX: iterate the current container's options only
                # (was options_containers.select(...), which pulled in
                # every select's options for each slot).
                for option in options_container.select(
                        'option[@value!=""]'):
                    option_id = option.select('@value').extract()[0]
                    option_name = option.select('text()').extract()[0]
                    option_price = option.select('@price').extract()[0]
                    option_attr = (option_id, option_name, option_price)
                    element_options.append(option_attr)
                combined_options.append(element_options)
            combined_options = list(
                itertools.product(*combined_options))
            for combined_option in combined_options:
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + ' ' + option[1]
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + '-' + option[0]
                    final_option['price'] = final_option.get(
                        'price', 0) + extract_price(option[2])
                extra_options.append(final_option)
        else:
            # Zero or one select: flat list of options.
            for option in options_containers.select('option[@value!=""]'):
                final_option = {}
                final_option['desc'] = ' ' + option.select(
                    'text()').extract()[0]
                final_option['identifier'] = '-' + option.select(
                    '@value').extract()[0]
                final_option['price'] = extract_price(
                    option.select('@price').extract()[0])
                extra_options.append(final_option)
        product_price = extract_price(product_data['basePrice'])
        for option_identifier, option_name in products.iteritems():
            option_item[
                'identifier'] = product_code + '-' + option_identifier
            option_item['name'] = product_name + option_name
            option_item[
                'price'] = product_price + prices[option_identifier]
            option_item['sku'] = option_item['identifier']
            if extra_options:
                for extra_option in extra_options:
                    extra_opt_item = deepcopy(option_item)
                    extra_opt_item['identifier'] = extra_opt_item[
                        'identifier'] + extra_option['identifier']
                    extra_opt_item['name'] = extra_opt_item[
                        'name'] + extra_option['desc']
                    extra_opt_item['price'] = extra_opt_item[
                        'price'] + extra_option['price']
                    # NOTE(review): sku deliberately kept as the BASE
                    # option identifier (without the extra-option
                    # suffix), matching the original; confirm that is
                    # the intended sku for combined variants.
                    extra_opt_item['sku'] = option_item['identifier']
                    yield extra_opt_item
            else:
                yield option_item
    else:
        yield item
def parse(self, response):
    """Schedule a crawl of every top-level category linked from the menu."""
    sel = HtmlXPathSelector(response)
    menu_xpath = '//ul[@id="menu"]//li[@class="level1"]/a/@href'
    for category_url in sel.select(menu_xpath).extract():
        yield Request(category_url, callback=self.parse_category)
def parse(self, response):
    """Follow each link in the #catNav list into the category parser."""
    selector = HtmlXPathSelector(response)
    for href in selector.select('//*[@id="catNav"]/li/a/@href').extract():
        yield Request(href, callback=self.parse_categories)
def parse_product(self, response):
    """Parse a product page, resubmitting the colour drop-down once per colour.

    First pass (no colour selected yet): POST the page form back once per
    colour option so each colour renders its own price, then bail out.
    Second pass (a colour is selected, or there are no colours): build the
    Product item; when both a ProdId and an image id are found, detour via
    the zoom-image page before yielding.
    """
    hxs = HtmlXPathSelector(response)
    category = hxs.select('//div[@id="bCrumb"]/span/a/text()').extract()
    # Fall back to the category carried through request meta on re-posts.
    category = category[-1] if category else response.meta.get('category', '')
    # value "0" is the "please select" placeholder option.
    colours = hxs.select(
        '//select[@id="cphMain_ddlColour"]/option[@value!="0"]/@value').extract()
    no_option_selected = hxs.select(
        '//select[@id="cphMain_ddlColour"]/option[@value="0" and @selected]/@value')
    if colours and no_option_selected:
        # ASP.NET postback: replay every hidden input, overriding the colour.
        for colour in colours:
            formdata = {}
            inputs = hxs.select('//form[@id="frmMain"]//input')
            for input in inputs:
                name = ''.join(input.select('@name').extract())
                value = ''.join(input.select('@value').extract())
                formdata[name] = value
            formdata['ctl00$cphMain$ddlColour'] = colour
            form_url = hxs.select('//form[@id="frmMain"]/@action').extract()[0]
            yield FormRequest(form_url,
                              dont_filter=True,
                              method='POST',
                              formdata=formdata,
                              callback=self.parse_product,
                              meta={'category': category})
        return
    loader = ProductLoader(item=Product(), selector=hxs)
    identifier = hxs.select('//div[@class="code"]/text()').extract()[0]
    loader.add_xpath('sku', '//div[@class="code"]/text()')
    loader.add_value('url', response.url)
    product_name = hxs.select('//div[@class="title"]//h1/text()').extract()[0]
    loader.add_value('category', category)
    img = hxs.select('//img[@id="cphMain_imgThumb"]/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    loader.add_xpath('brand', '//span[@class="brand"]/text()')
    loader.add_value('stock', '1')
    # NOTE(review): price has not been added to the loader yet at this
    # point, so get_output_value('price') presumably evaluates against an
    # empty/default value — confirm the intended shipping threshold works.
    if loader.get_output_value('price') < 50.00:
        loader.add_value('shipping_cost', '4.95')
    else:
        loader.add_value('shipping_cost', '0')
    price = hxs.select('//span[@class="price"]/text()').extract()
    if colours:
        # Option text looks like "Red - £12.34"; split name from price.
        colour = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@selected]/text()'
        ).extract()[0]
        colour_id = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@selected]/@value'
        ).extract()[0]
        loader.add_value('identifier', identifier + '-' + colour_id)
        loader.add_value(
            'name',
            product_name + ' - ' + colour.split(u' - \xa3')[0].strip())
        option_price = re.search(r"\xa3(\d+.\d+)", colour)
        if option_price:
            loader.add_value('price', option_price.group(1))
        else:
            loader.add_value('price', price)
        colour = colour.split(u' - \xa3')[0].strip()
    else:
        colour = hxs.select(
            '//span[@id="cphMain_lblSelectedColour"]/b/text()').extract()
        if colour:
            product_name = product_name + ' - ' + colour[0].strip()
        colour = ''.join(colour)
        loader.add_value('identifier', identifier)
        loader.add_value('name', product_name)
        loader.add_value('price', price)
    # Image ids are matched by the colour name embedded in the img alt text.
    image_id = hxs.select('//img[@alt="' + colour.strip().upper() +
                          '"]/@src').re(r'Products/(\d+)-')
    if not image_id:
        image_id = hxs.select('//img[@alt="' + colour.strip() +
                              '"]/@src').re(r'Products/(\d+)-')
    prod_id = re.search(r'ProdId=(.*)&', response.url)
    if prod_id and image_id:
        image_id = image_id[0]
        prod_id = prod_id.group(1)
        product = loader.load_item()
        image_page = 'http://www.gooutdoors.co.uk/ZoomProductImages.aspx?ProductId=%s&ProductImageId=%s' % (
            prod_id, image_id)
        # Fetch the zoom page to resolve the full-size image URL.
        yield Request(image_page,
                      callback=self.parse_image,
                      meta={'product': product})
    else:
        yield loader.load_item()
def parse_products(self, response):
    """Scrape one Amazon product detail page into a single AmazonItem.

    Most fields keep the raw list returned by ``extract()``; ``COD``,
    ``category`` and ``subcategory`` are hard-coded for this crawl.
    Returns a one-element list of items.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    item = AmazonItem()
    item['title'] = hxs.select(
        '//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()'
    ).extract()
    item['brand'] = hxs.select('//a[@id="brand"]/text()').extract()
    item['specs'] = hxs.select(
        '//div[@class="pdTab"][1]//node()').extract()
    item['offerprice'] = hxs.select(
        '//span[@id="priceblock_ourprice"]/text()').extract()
    item['saleprice'] = hxs.select(
        '//span[@id="priceblock_saleprice"]/text()').extract()
    item['description'] = hxs.select(
        '//div[@id="productDescription"]//text()').extract()
    item['feature'] = hxs.select(
        '//ul[@class="a-vertical a-spacing-none"]/li/span/text()').extract()
    item['image'] = hxs.select(
        '//span[@class="a-button-text"]/img/@src').extract()
    # URL is carried through request meta by the caller.
    item['link'] = response.meta["url"]
    item['seller'] = hxs.select(
        '//div[@id="merchant-info"]/a[1]/text()').extract()
    item['sellrating'] = hxs.select(
        '//div[@id="merchant-info"]/text()').extract()
    # Fix: the original indexed extract()[0] unconditionally, which raises
    # IndexError on pages without a star-rating element; default to ''.
    starating = hxs.select(
        '//a[@class="a-link-normal"]/i/span/text()').extract()
    item['starating'] = starating[0] if starating else ''
    item['COD'] = "Available"
    item['category'] = "Mobiles & Tablets"
    item['subcategory'] = "Wearable Devices"
    items.append(item)
    return items
def parse_product(self, response):
    """Parse a product page, fanning out one form POST per item-list option.

    On the first visit (no ``requested`` flag in meta) every option of the
    item-list select is re-posted through the ASP.NET form so each variant
    gets its own response; each re-post comes back into this same callback
    with ``requested`` set, where the item is actually built and yielded.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    name = hxs.select(
        u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract(
        )[0].strip()
    multiple_options = hxs.select(
        u'//select[@class="mpv_itemalst"]//option')
    if multiple_options and not u'requested' in response.meta:
        # Fan out: one ASP.NET postback per option; dont_click avoids
        # triggering the form's submit button.
        for option in multiple_options:
            formname = u'aspNetForm'
            formdata = {
                u'ctl00$MainContent$ItemAList':
                option.select(u'./@value').extract()[0],
                u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                u'__EVENTARGUMENT': u''
            }
            req = FormRequest.from_response(response,
                                            formname=formname,
                                            formdata=formdata,
                                            meta={u'requested': True},
                                            dont_click=True,
                                            callback=self.parse_product)
            yield req
    if multiple_options:
        # Append the currently-selected option's label to the product name.
        name += u' %s' % multiple_options.select(
            u'../option[@selected]/text()').extract()[0].strip()
    loader = ProductLoader(item=Product(), response=response)
    product_id = hxs.select(
        '//*[@id="ctl00_MainContent_lblLinecode"]/text()').re(r'(\d+)')
    if product_id:
        loader.add_value('identifier', product_id[0])
    else:
        self.log('ERROR: Identifier not found!')
    product_sku = hxs.select(
        '//*[@id="ctl00_MainContent_lblProductCode"]/text()').re(r'(\d+)')
    if product_sku:
        loader.add_value('sku', product_sku[0])
    else:
        self.log('ERROR: SKU not found!')
    product_image = hxs.select('//*[@id="zoom1"]/@href').extract()
    if product_image:
        url = urljoin_rfc(get_base_url(response), product_image[0])
        loader.add_value('image_url', url)
    product_category = hxs.select(
        '//*[@id="papertrail"]/ul/li[1]/a/text()').extract()
    if product_category:
        loader.add_value('category', product_category[0])
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    # Prefer the offer price; fall back to the regular price label.
    loader.add_xpath(
        'price',
        u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
    # Only yield items that actually resolved a price.
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, recursing into sub-items and option pages.

    Flow: (1) if the page is a listing of sub-items, schedule each and
    stop; (2) if it has option links not yet followed, schedule each with
    ``option`` set and stop; (3) otherwise scrape the base product, then
    either yield it directly or expand the multi-product offers section,
    de-duplicating identifiers via ``self.id_seen`` throughout.
    """
    hxs = HtmlXPathSelector(response)
    sub_items = hxs.select(
        '//div[@class="item-details"]//h3/a/@href').extract()
    if sub_items:
        for sub_item in sub_items:
            url = urljoin(response.url, sub_item)
            yield Request(url, callback=self.parse_product)
        return
    option_links = hxs.select(
        '//form[@id="save-product-to-cart"]//div/ul[contains(@class, "selection-grid")]/li/a/@href'
    ).extract()
    # Only follow option links once; re-entry carries option=True in meta.
    if not response.meta.get('option', False) and option_links:
        for link in option_links:
            url = urljoin(response.url, link)
            yield Request(url,
                          meta={'option': True},
                          dont_filter=True,
                          callback=self.parse_product)
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    #== Extracting Identifier and SKU ==#
    tmp = hxs.select('//div[@id="prod-product-code"]/p/text()').extract()
    if not tmp:
        tmp = hxs.select(
            '//div[@id="bundle-product-code"]/p/text()').extract()
    if tmp:
        loader.add_value('identifier', tmp[0])
        loader.add_value('sku', tmp[0])
    #== Extracting Product Name ==#
    # Cascade of name locations; each fallback covers a page template.
    try:
        name = hxs.select(
            '//h1[@id="prod-title"]/span/text()').extract()[0].strip()
    except:
        try:
            name = hxs.select(
                "//div[@class='mod mod-product-info']/h2/text()").extract(
                )[0].strip()
        except:
            name = hxs.select('//h1[@id="prod-title"]/text()').extract()
            if name:
                name = name[0].strip()
            else:
                name = hxs.select(
                    '//h1/span[@itemprop="name"]/text()').extract()
                if name:
                    name = name[0].strip()
                else:
                    log.msg('### No name at ' + response.url,
                            level=log.INFO)
    tmp = hxs.select('//div[@class="detail-pair"]/p/text()').extract()
    if tmp:
        name += ', ' + tmp[0]
    loader.add_value('name', name)
    #== Extracting Price, Stock & Shipping cost ==#
    price = 0
    tmp = hxs.select(
        '//div[@class="basket-fields"]/meta[@itemprop="price"]/@content'
    ).extract()
    if not tmp:
        tmp = hxs.select(
            '//section[div[@id="prod-product-code"]]//div[@id="prod-price"]/p//strong//text()'
        ).extract()
    if not tmp:
        tmp = hxs.select(
            '//div[@id="prod-price"]//span[@itemprop="price"]/text()'
        ).extract()
    if not tmp:
        tmp = hxs.select('//strong[@class="price"]/text()').extract()
    if tmp:
        # Strip thousands separators before price extraction.
        price = extract_price(''.join(tmp).strip().replace(',', ''))
    loader.add_value('price', price)
    try:
        loader.add_xpath('stock', '//div[@data-jl-stock]/@data-jl-stock')
    except ValueError:
        loader.add_value('stock', '0')
    #== Extracting Image URL ==#
    tmp = hxs.select('//li[contains(@class,"image")]//img/@src').extract()
    if tmp:
        url = urljoin(response.url, tmp[0])
        loader.add_value('image_url', url)
    #== Extracting Brand ==#
    tmp = hxs.select('//div[@itemprop="brand"]/span/text()').extract()
    if tmp:
        loader.add_value('brand', tmp[0].strip())
    #== Extracting Category ==#
    tmp = hxs.select('//div[@id="breadcrumbs"]/ol/li/a/text()').extract()
    if len(tmp) > 1:
        loader.add_value('category', ' > '.join(tmp[-3:]))
    product = loader.load_item()
    #== Extracting Options ==#
    options = hxs.select(
        '//div[@id="prod-multi-product-types"]//div[@itemprop="offers"]')
    if not options:
        # Single-product page: emit it once, guarding against duplicates.
        if not product.get('identifier', None):
            log.msg('### No product ID at ' + response.url,
                    level=log.INFO)
        else:
            if not product['identifier'] in self.id_seen:
                self.id_seen.append(product['identifier'])
                yield product
            else:
                log.msg('### Duplicate product ID at ' + response.url,
                        level=log.INFO)
        return
    #== Process options ==#
    # Each offer becomes a copy of the base product with its own
    # identifier, name suffix, price and stock flag.
    for sel in options:
        item = copy.deepcopy(product)
        tmp = sel.select(
            './/div[contains(@class,"mod-product-code")]/p/text()'
        ).extract()
        if tmp:
            item['identifier'] = tmp[0]
            item['sku'] = tmp[0]
        tmp = sel.select('.//h3/text()').extract()
        if tmp:
            item['name'] = name + ' - ' + tmp[0]
        price = 0
        tmp = sel.select('.//p[@class="price"]/strong/text()').re(
            '[0-9,.]+')
        if not tmp:
            tmp = sel.select('.//strong[@class="price"]/text()').re(
                '[0-9,.]+')
        if tmp:
            price = extract_price(tmp[0].strip().replace(',', ''))
        item['price'] = price
        tmp = sel.select(
            './/link[@itemprop="availability"]/@content').extract()
        if tmp and 'in' in tmp[0].lower():
            item['stock'] = 1
        else:
            item['stock'] = 0
        if not item.get('identifier', None):
            log.msg('### No product ID at ' + response.url,
                    level=log.INFO)
        else:
            if not item['identifier'] in self.id_seen:
                self.id_seen.append(item['identifier'])
                yield item
            else:
                log.msg('### Duplicate product ID at ' + response.url,
                        level=log.INFO)
def parse_product(self, response):
    """Parse a product page into option and sub-option items.

    Two page templates are supported throughout: the newer ``SkuList``/
    ``SkuGroup`` markup and the older "flavor-table" markup — every
    selector has a fallback for the second template.  Each option is a
    deep copy of the base item; each row within an option becomes a
    further copy (sub-item) with its own identifier, name, price, stock.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    name = response.css('h1.fn::text').re('\S+')
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand', ''))
    # Drop the first three breadcrumb entries (site-level crumbs).
    categories = hxs.select(
        '//a[@class="bb-crumb__link"]/text()').extract()[3:]
    if not categories:
        categories = hxs.select(
            '//div[@id="breadcrumbs"]//a/text()').extract()[3:]
    loader.add_value('category', categories)
    image_url = hxs.select(
        '//img[@class="Product__img img-responsive"]/@src').extract()
    if not image_url:
        image_url = hxs.select('//img[@class="photo"]/@src').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])
    item = loader.load_item()
    options = response.xpath(
        '//div[@class="SkuList"]/div[contains(@class,"SkuGroup")]')
    if not options:
        options = response.xpath(
            '//div[@id="right-content-prod"]/table[contains(@class, "flavor-table flava-flav")]'
        )
    if options:
        for option in options:
            option_item = deepcopy(item)
            option_name = option.select(
                './/span[@class="SkuGroup__heading__name"]/text()'
            ).extract()
            if not option_name:
                option_name = option.select(
                    './tr/td/span/text()').extract()
            option_name = option_name[0].strip()
            option_item['name'] += ' ' + option_name
            price = ''.join(
                option.select(
                    './/span[@class="SkuGroup__sale-price"]/text()').
                extract()).strip()
            if not price:
                price = ''.join(
                    option.select(
                        './tr/td[contains(@class, "size-price")]//span[@class="price"]/text()'
                    ).extract()).strip()
            option_item['price'] = extract_price(price)
            sub_options = option.select('.//tr[td[@class="availability"]]')
            if not sub_options:
                sub_options = option.select(
                    './/tr[@class="SkuGroup__sku"]')
            if sub_options:
                # Each sub-option row refines the option item further.
                for sub_option in sub_options:
                    sub_item = deepcopy(option_item)
                    identifier = sub_option.select(
                        './/meta[@itemprop="sku"]/@content').extract()
                    if not identifier:
                        identifier = sub_option.select(
                            './/form/input[contains(@name, "catalogRefIds") and @value!=" "]/@value'
                        ).extract()
                    sub_item['identifier'] = identifier[0]
                    name = sub_option.select(
                        './/td[@class="SkuGroup__sku__flavor"]/text()'
                    ).extract()
                    if not name:
                        name = sub_option.select(
                            './/td/h5/text()').extract()
                    if name:
                        sub_item['name'] += ' ' + name[0].strip()
                    price = ''.join(
                        sub_option.select(
                            './tr/td[contains(@class, "size-price")]/span/span[@class="price"]/text()'
                        ).extract())
                    if not price:
                        price = ''.join(
                            sub_option.select(
                                './td[contains(@class, "size-price")]/span[@class="price"]/text()'
                            ).extract())
                    if price:
                        sub_item['price'] = extract_price(price.strip())
                    # Stock defaults to whatever the option item carried;
                    # only explicitly zeroed when no "In Stock" cell exists.
                    in_stock = sub_option.select(
                        './/td[@class="SkuGroup__sku__availability" and contains(text(), "In Stock")]'
                    ).extract()
                    if not in_stock:
                        in_stock = sub_option.select(
                            './/td[@class="availability" and contains(text(), "In Stock")]'
                        ).extract()
                    if not in_stock:
                        sub_item['stock'] = 0
                    yield sub_item
def parse(self, response): hxs = HtmlXPathSelector(response) url = hxs.select('//td[contains(@align,"left")]').select( 'a/@href').extract() print url
def parse_product(response):
    """Scrape one product page (itemprop meta tags) into a Product item.

    NOTE(review): the signature takes only ``response`` — no ``self``.
    If this is meant to be a spider method (like its siblings in this
    file), Scrapy would bind the response to ``self`` and this would
    break at runtime; confirm whether it is a free function or a method
    missing ``self``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//meta[@itemprop="image"]/@content').extract()
    product_identifier = hxs.select(
        '//input[@name="product"]/@value').extract()[0]
    product_name = hxs.select('//div[@class="product-name"]//h1/div/text()'
                              ).extract()[0].strip()
    price = hxs.select('//meta[@itemprop="price"]/@content').extract()
    price = extract_price(price[0])
    category = hxs.select(
        '//meta[@itemprop="category"]/@content').extract()[0].split('>')
    # Meta category can be an empty string; fall back to breadcrumbs
    # (skipping the first two site-level crumbs).
    if not ''.join(category):
        category = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()[2:]
    brand = hxs.select('//meta[@itemprop="brand"]/@content').extract()
    sku = hxs.select('//meta[@itemprop="sku"]/@content').extract()
    # Presence of the out-of-stock paragraph marks the item unavailable.
    stock = hxs.select('//p[@class="availability out-of-stock"]').extract()
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
    product_loader.add_value('sku', sku)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand)
    if stock:
        product_loader.add_value('stock', 0)
    product = product_loader.load_item()
    yield product
def parse_product(self, response):
    """Scrape an Argos product page, then fetch its Bazaarvoice reviews.

    Aborts early (with a log line) when the name or price cannot be
    found, or when crawling the 'lifetime' brand and the name does not
    mention it.  On success the item carries KeterMeta metadata and a
    follow-up request is issued for the embedded review feed.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    brand = response.meta.get('brand', '')
    l = ProductLoader(item=Product(), response=response)
    name = hxs.select("//div[@id='pdpProduct']/h1/text()").extract()
    if not name:
        self.log("ERROR! NO NAME! %s" % url)
        log.msg('ERROR! NO NAME!')
        return
    name = name[0].strip()
    # Brand sanity filter: skip non-Lifetime items in the Lifetime crawl.
    if brand.lower() == 'lifetime' and name.lower().find('lifetime') == -1:
        return
    price = hxs.select(
        "//div[@id='pdpPricing']/span[@class='actualprice']/span/text()"
    ).extract()
    if not price:
        self.log("ERROR! NO PRICE! %s %s" % (url, name))
        return
    price = "".join(price)
    sku = hxs.select(
        "//span[@class='identifier']/span[contains(@class, 'partnumber')]/text()"
    ).extract()
    if not sku:
        # Missing SKU is logged but deliberately not fatal.
        self.log("ERROR! SKU! %s %s" % (url, name))
        # return
    else:
        l.add_value('sku', sku[0])
    category = ''
    # Category is mined out of an inline script containing EFFECTIVE_URL;
    # the segment after 'category_root' is pipe-delimited.
    s = hxs.select(
        "//script[contains(text(),'EFFECTIVE_URL')]/text()").extract()
    if s:
        s = s[0].strip()
        pos = s.find('category_root')
        if pos != -1:
            s = s[pos:].split('|')
            if len(s) > 1:
                category = s[1].replace('+', ' ')
    l.add_value('category', category)
    if category == '':
        self.log("ERROR! NO Category found! %s %s" % (url, name))
    product_image = hxs.select('//*[@id="mainimage"]/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response),
                            product_image[0].strip())
        l.add_value('image_url', image)
    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    l.add_value('brand', brand.strip().lower())
    l.add_xpath('identifier', u'//form/input[@name="productId"]/@value')
    product = l.load_item()
    metadata = KeterMeta()
    metadata['brand'] = brand.strip().lower()
    metadata['reviews'] = []
    product['metadata'] = metadata
    reviews_url = 'http://argos.ugc.bazaarvoice.com/1493-en_gb/%s/reviews.djs?format=embeddedhtml'
    # part_number = hxs.select(u'//form/input[@name="partNumber"]/@value').extract()[0]
    # Part number is taken from the URL path, not the form field above.
    part_number = re.search(r'/partNumber/(\d+)', response.url).group(1)
    yield Request(reviews_url % part_number,
                  callback=self.parse_review_page,
                  meta={'product': product})
def parse_product(self, response):
    """Scrape a Rubbermaid product page, then page through its reviews.

    The item itself is built with XPath selectors; afterwards the shared
    Selenium/PhantomJS browser (``self._browser``) loads the same URL and
    clicks the Bazaarvoice "Load more" button up to 25 times so all
    reviews are rendered.  NOTE(review): the loaded ``product`` is built
    but not yielded within this block — presumably consumed by code that
    follows; confirm against the full method.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    shipping_cost = hxs.select(
        './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
    ).extract()
    if not shipping_cost:
        shipping_cost = hxs.select(
            './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
        ).extract()
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')
    loader.add_value(
        'category',
        hxs.select('//div[@class="breadcrum"]/div/a/text()').extract()[-1])
    loader.add_xpath(
        'identifier',
        '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')
    price = hxs.select(
        './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
    ).extract()
    if price:
        loader.add_value('price', price[0])
    else:
        loader.add_value('price', 0)
    # Best-effort: shipping_cost may be empty, so swallow the IndexError.
    try:
        loader.add_value('shipping_cost', shipping_cost[0].strip())
    except:
        pass
    item = hxs.select('//td/strong')
    if item and item[0].select('../text()'):
        # SKU sits in the text next to the <strong>, wrapped in "#(...)".
        loader.add_value(
            'sku',
            item[0].select('../text()').extract()[1].strip('#() '))
    image_url = hxs.select(
        '//div[@id="divImageBlock"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('brand', 'Rubbermaid')
    product = loader.load_item()
    product['sku'] = product['sku'].upper()
    metadata = KeterMeta()
    metadata['brand'] = 'Rubbermaid'
    metadata['reviews'] = []
    product['metadata'] = metadata
    self.log('>> BROWSER => GET < %s />' % response.url)
    self._browser.get(response.url)
    self.log('>> OK')
    self.log('>> BROWSER => Looking for more reviews ...')
    try:
        load_more_button = self._browser.find_element_by_xpath(
            '//div[@class="bv-content-pagination"]//button')
        more_reviews = load_more_button.is_displayed()
        # Cap the click loop so a sticky button cannot spin forever.
        max_pages = 25
        while more_reviews and max_pages:
            self.log('>> More reviews found...')
            load_more_button.click()
            self.log('>> BROWSER => CLICK "Load more"')
            # Give the AJAX-loaded review page time to render.
            time.sleep(20)
            self.log('>> OK')
            load_more_button = self._browser.find_element_by_xpath(
                '//div[@class="bv-content-pagination"]//button')
            more_reviews = load_more_button.is_displayed()
            max_pages -= 1
        self.log('>> No more reviews...')
    except Exception, e:
        # Typically raised when the pagination button is absent.
        self.log('>> ERROR FOUND => %s' % e)
def browse_and_parse(self, response):
    """Recursively crawl category/listing pages and emit product items.

    Navigation links are deduplicated through ``self.navig_url_set`` so
    each page is visited once.  Price strings look like European-format
    "EUR 1.234,56 - ..." — the split/replace chain takes the first value
    and converts it to a dot-decimal number.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    for subcat_href in hxs.select(
            '//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url not in self.navig_url_set:
            self.navig_url_set.add(subsubcat_url)
            yield Request(subsubcat_url, callback=self.browse_and_parse)
    pages = hxs.select(
        '//div[@id="newProductsDefaultListingTopLinks"]//a/@href').extract(
        )
    for url in pages:
        yield Request(url, callback=self.browse_and_parse)
    # parse product listing in this page, if any
    for product in hxs.select(
            '//table[@class="table-product-attributes"]'):
        product_loader = ProductLoader(item=Product(), response=response)
        url = product.select('.//td[@class="main"]/a/@href').extract()[0]
        product_loader.add_value(
            'identifier',
            re.search(r'products_id=(\d+)', url).groups()[0])
        product_loader.add_value('url', url)
        product_loader.add_value(
            'name',
            product.select(
                './/td[@class="main"]/a/strong/text()').extract()[0])
        # Regular price first; special (discount) price as fallback.
        try:
            price = product.select('.//span[@class="table-price"]/text()')\
                .extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.')
        except:
            price = product.select('.//span[@class="productSpecialPrice"]/text()')\
                .extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.')
        product_loader.add_value('price', price)
        yield product_loader.load_item()
    # edge case: product listing page with a single product
    product_price = hxs.select(
        '//h2[@id="productPrices"]/text()').extract()
    if product_price:
        # this product listing page contains a single product
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value(
            'identifier',
            re.search(r'products_id=(\d+)', response.url).groups()[0])
        try:
            product_loader.add_value('price', product_price[0].split("-")[0]\
                .split(" ")[1].replace('.', '').replace(',', '.'))
        except:
            product_loader.add_value('price', hxs.select('//span[@class="productSpecialPrice"]/text()').extract()[0]\
                .split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield product_loader.load_item()
def parse_product(self, response):
    """Scrape a Magento product page, expanding spConfig variant options.

    When the inline ``Product.Config`` JSON is present, one item per
    child product is emitted (identifier suffixed, label appended, price
    taken from the child's finalPrice with 20% VAT added).  Otherwise a
    single item is built from the page's price elements.  Promotion
    flagging attaches ColourBankMeta; related-category links are also
    scheduled at the end.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select(
        '//div[@id="product-image-container"]//img[1]/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//img[@id="product-main-image"]/@src').extract()
    # Identifier from the hidden product input, else from the cart form
    # action URL.
    try:
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0].strip()
    except:
        product_identifier = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')[0]
    product_name = hxs.select(
        'normalize-space(//h1[@class="product-title"]/text())').extract(
        )[0]
    category = hxs.select(
        '//nav[@id="breadcrumd_abbotandknight"]//li/a/text()').extract()
    category = category[-1].strip() if category else ''
    brand = ''
    promotion = False
    # Brand and promotion state come from the specs table rows.
    feature_names = hxs.select(
        '//*[@id="product-attribute-specs"]//td[@class="feature-title"]/text()'
    ).extract()
    feature_values = hxs.select(
        '//*[@id="product-attribute-specs"]//td[@class="feature-description"]/text()'
    ).extract()
    for name, value in zip(feature_names, feature_values):
        if name.strip() == 'Brand:':
            brand = value.strip()
        elif name.strip() == 'Promotions:' and value.strip() == 'On Sale':
            promotion = True
    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        products = {}
        # Accumulate " - label" suffixes per child product id.
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
        for identifier, option_name in products.iteritems():
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name', product_name + option_name)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            # finalPrice is ex-VAT; multiply by 1.2 to get the gross price.
            price = float(product_data['childProducts'][identifier]
                          ['finalPrice']) * 1.2
            product_loader.add_value('price', round(price, 2))
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product = product_loader.load_item()
            if promotion:
                metadata = ColourBankMeta()
                metadata['sold_as'] = 'Promotion'
                product['metadata'] = metadata
            yield product
    else:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        # Price lookup cascade: current price span, bare text node, then
        # the old-price element when the current one is blank.
        price = hxs.select(
            '//*[@id="product-price-{}"]/span/text()'.format(
                product_identifier)).extract()
        if not price:
            price = hxs.select('//*[@id="product-price-{}"]/text()'.format(
                product_identifier)).extract()
        if price and price[0].strip() == '':
            price = hxs.select(
                '//*[@id="old-price-{}"]/span/text()'.format(
                    product_identifier)).extract()
        price = extract_price(price[0].strip())
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product = product_loader.load_item()
        if promotion:
            metadata = ColourBankMeta()
            metadata['sold_as'] = 'Promotion'
            product['metadata'] = metadata
        yield product
    # Related categories
    for url in hxs.select(
            '//div[@id="product-related"]//a/@href').extract():
        yield Request(
            add_or_replace_parameter(urljoin_rfc(base_url, url), 'limit',
                                     'all'),
            self.parse_categories_products)
def parse_product(self, response):
    """Scrape a Magento product page, resolving prices via the cart when
    needed.

    Price is tried from several on-page locations (price block, special
    price, itemprop, then an inline ``realPrice`` script blob).  Products
    with options (select drop-downs or radio groups) cannot be priced on
    the page: one option combination is posted to the add-to-cart form
    and the remaining combinations travel in meta for ``parse_cart`` to
    continue with.  A row per product is also appended to
    ``self.csv_writer``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    res = {}
    name = hxs.select("//div[@class='product-name']/h1/text()").extract()
    url = response.url
    price = "".join(
        hxs.select(
            "//div[@class='col-right']/div/div[@class='price-block']/span/span[@class='price']/text()"
        ).re(r'([0-9\,\. ]+)')).strip()
    if not price:
        price = "".join(
            hxs.select(
                "//div[@class='col-right']/div/p[@class='special-price']/span[@class='price']/text()"
            ).re(r'([0-9\,\. ]+)')).strip()
    if not price:
        price = hxs.select('//*[@itemprop="price"]//text()').re(
            r'([\d.,]+)')
    if not price:
        # Last resort: the price lives in an escaped JS string assigned to
        # realPrice; unescape it and parse as HTML.
        try:
            price_popup_hxs = HtmlXPathSelector(
                text=re.search(r'realPrice = (.*)', response.body).groups(
                )[0].replace('\\n', '').replace('\\t', '').replace(
                    '\\', '')[1:-2].strip())
            price = price_popup_hxs.select(
                '//span[@class="price"]/text()').extract()
        except:
            pass
    try:
        sku = hxs.select("//dd[@class='identifier']/text()")[0].extract()
    except:
        sku = ''
    res['url'] = urljoin_rfc(base_url, url)
    res['description'] = sku + ' ' + name[0].strip()
    res['image_url'] = hxs.select(
        '//a[@id="image-link"]/img/@src').extract()
    category = hxs.select('//div[@class="breadcrumbs"]//a/span/text()')
    if category:
        res['category'] = category[-1].extract()
    res['brand'] = hxs.select('//dd[@class="brand"]/text()').extract()
    # res['sku'] = sku
    res['identifier'] = sku
    sku2 = hxs.select("//div[@class='1']/text()").extract()
    if not sku2:
        sku2_ = 0
    else:
        sku2_ = sku2[0]
    sku3 = hxs.select("//div[@class='2']/text()").extract()
    if not sku3:
        sku3_ = 0
    else:
        sku3_ = sku3[0]
    model = hxs.select("//dd[@class='model']/text()").extract()
    if not model:
        self.log('NO MODEL/SKU => %s' % (res['url'], ))
        model_ = ''
    else:
        model_ = model[0]
    res['sku'] = model_  # Using model field as SKU
    self.csv_writer.writerow(
        [res['sku'], sku2_, sku3_, model_, name[0].strip()])
    options_select = hxs.select(
        '//div[@id="product-options-wrapper"]//select')
    options_radio = hxs.select(
        '//div[@id="product-options-wrapper"]//ul[@class="options-list"]')
    if options_select:
        form_action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        # Hidden inputs replayed with every cart submission.
        params = dict(
            zip(
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@name').
                extract(),
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@value').
                extract()))
        product_data = json.loads(
            re.search(r'var spConfig = new Product.Config\((.*)\)',
                      response.body).groups()[0])
        for product in product_data['attributes'].values():
            attr = product['id']
            super_attr_param = u'super_attribute[%s]' % attr
            option_params = []
            for option in product['options']:
                opt_params = params.copy()
                opt_params[super_attr_param] = option['id']
                option_params.append(opt_params)
            # Submit one option now; pass the rest along for parse_cart.
            opt_params = option_params.pop()
            yield FormRequest(form_action,
                              formdata=opt_params,
                              callback=self.parse_cart,
                              meta={
                                  'item': res,
                                  'params': option_params,
                                  'form_action': form_action,
                                  'cookiejar': response.meta['cookiejar']
                              },
                              dont_filter=True)
    elif options_radio:
        form_action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        params = dict(
            zip(
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input[not(@type="radio") and not(@disabled)]/@name'
                ).extract(),
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input[not(@type="radio") and not(@disabled)]/@value'
                ).extract()))
        # NOTE(review): the [1:] slices skip the first radio input —
        # presumably a placeholder; confirm against the page markup.
        options = zip(
            hxs.select(
                '//form[@id="product_addtocart_form"]//input[@type="radio" and not(@disabled)]/@name'
            ).extract()[1:],
            hxs.select(
                '//form[@id="product_addtocart_form"]//input[@type="radio" and not(@disabled)]/@value'
            ).extract()[1:])
        option_params = []
        for option in options:
            opt_params = params.copy()
            opt_params.update({option[0]: option[1]})
            option_params.append(opt_params)
        opt_params = option_params.pop()
        yield FormRequest(form_action,
                          formdata=opt_params,
                          callback=self.parse_cart,
                          meta={
                              'item': res,
                              'params': option_params,
                              'form_action': form_action,
                              'cookiejar': response.meta['cookiejar']
                          },
                          dont_filter=True)
    elif price:
        # No options and an on-page price: emit the product directly.
        res['price'] = price
        yield load_product(res, response)
    else:
        # No options and no visible price: add to cart to discover it.
        form_action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        params = dict(
            zip(
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@name').
                extract(),
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@value').
                extract()))
        yield FormRequest(form_action,
                          formdata=params,
                          callback=self.parse_cart,
                          meta={
                              'item': res,
                              'cookiejar': response.meta['cookiejar']
                          },
                          dont_filter=True)
def parse_item(self, response):
    """Parse a search-result product page, verifying the brand matches.

    The page brand must equal the searched brand (carried in meta) or the
    item is dropped.  On success, a Product with KeterMeta is built; when
    a review count is present the first review page is requested instead
    of yielding the product directly.
    """
    hxs = HtmlXPathSelector(response)
    # Ensure the search matched brand, not some part of name or description
    brand = hxs.select(
        u'//div/div/p/b[contains(text(),"Brand")]/../../../div[2]/p/text()'
    ).extract()
    brand = brand and brand[0].strip().lower()
    # XXX No brand field for some suncast products, but they have brand in name
    if not brand:
        logging.warning('Brand not found [%s]' % response.url)
        brand = ''
    name = hxs.select(u'//h1/text()').extract()[0].strip()
    if response.meta['brand'].lower() in name.lower():
        logging.warning('Assume [%s] from name' % response.meta['brand'])
        brand = response.meta['brand'].lower()
    # Normalise any keter brand variant to the canonical spelling.
    if 'keter' in brand.lower():
        brand = 'keter'
    if response.meta['brand'].lower() != brand:
        logging.warning(
            'Brand [%s] not equal to search result brand [%s] [%s]' %
            (response.meta['brand'], brand, response.url))
        return
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', u'//h1/text()')
    sku = hxs.select(u'//meta[@property="eb:id"]/@content').extract()[0]
    product_loader.add_value('sku', sku)
    product_loader.add_value('identifier', sku)
    # Price is split across the span text and a nested span (pence part).
    price = hxs.select('//span[@class="ppPrice"]/text()').extract()[0]
    price += hxs.select(
        '//span[@class="ppPrice"]/span/text()').extract()[0]
    product_loader.add_value('price', price)
    product_loader.add_value('brand', brand.lower())
    product_loader.add_xpath('image_url', '//*[@id="jqzoom"]/@href')
    product_loader.add_value('url', response.url)
    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = brand
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product
    n_reviews = hxs.select(
        u'//div[@class="prSnippetReadReviews"]/a/text()').extract()
    if n_reviews:
        # Link text is like "Read 12 reviews" — the count is token [1].
        n_reviews = int(n_reviews[0].split()[1])
        review_sku = hxs.select(
            u'//div[@id="HN_PP"]/@ppskunum').extract()[0]
        # 5 reviews per page
        pages = n_reviews / 5
        if n_reviews % 5 > 0:
            pages += 1
        response.meta['review_sku'] = review_sku
        response.meta['review_pages'] = pages
        response.meta['review_n'] = 1
        yield Request(review_url(response.meta['review_sku'],
                                 response.meta['review_n']),
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
class RubbermaidSpider(BaseSpider):
    """Spider for rubbermaid.com outdoor/garage product categories.

    Builds Product items with Keter metadata and collects Bazaarvoice
    reviews.  Reviews sit behind a JavaScript "Load more" button, so a
    PhantomJS browser is driven to expand the review list before the
    rendered page source is parsed.
    """

    name = 'keter-rubbermaid.com'
    allowed_domains = ['rubbermaid.com']
    start_urls = [
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=shed-accessories',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=VerticalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=HorizontalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=DeckBoxesPatioBenches',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=ResinCabinets',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=FastTrackGarageOrganizationSystem'
    ]

    def __init__(self, *args, **kwargs):
        super(RubbermaidSpider, self).__init__(*args, **kwargs)
        # Make sure the headless browser is shut down with the spider.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self._browser = PhantomJS.create_browser()
        max_wait = 60  # seconds, applied to both page loads and scripts
        self._browser.set_page_load_timeout(max_wait)
        self._browser.set_script_timeout(max_wait)

    def spider_closed(self):
        """Signal handler: release the PhantomJS instance."""
        self._browser.quit()

    def parse(self, response):
        """Follow every product link inside the category grid."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select(
                '//div[@id="foodStorageBlock"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)

    def parse_product(self, response):
        """Scrape one product page, including its expanded review list."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # Surcharge may appear under either label; try both layouts.
        shipping_cost = hxs.select(
            './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
        ).extract()
        if not shipping_cost:
            shipping_cost = hxs.select(
                './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
            ).extract()
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')
        # Deepest breadcrumb entry is used as the category.
        loader.add_value(
            'category',
            hxs.select('//div[@class="breadcrum"]/div/a/text()').extract()[-1])
        loader.add_xpath(
            'identifier',
            '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')
        price = hxs.select(
            './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
        ).extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # No visible price on the page -- record it as zero.
            loader.add_value('price', 0)
        try:
            loader.add_value('shipping_cost', shipping_cost[0].strip())
        except:
            # Best effort: no surcharge cell found, leave the field unset.
            pass
        item = hxs.select('//td/strong')
        if item and item[0].select('../text()'):
            # SKU is taken from the second text node beside the <strong>,
            # stripped of '#', parentheses and spaces -- presumably the
            # cell reads like "Item (#SKU)"; TODO confirm.
            loader.add_value(
                'sku',
                item[0].select('../text()').extract()[1].strip('#() '))
        image_url = hxs.select(
            '//div[@id="divImageBlock"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', 'Rubbermaid')
        product = loader.load_item()
        product['sku'] = product['sku'].upper()
        metadata = KeterMeta()
        metadata['brand'] = 'Rubbermaid'
        metadata['reviews'] = []
        product['metadata'] = metadata
        # Re-fetch the page in the headless browser so the JS-driven
        # Bazaarvoice review pagination can be expanded.
        self.log('>> BROWSER => GET < %s />' % response.url)
        self._browser.get(response.url)
        self.log('>> OK')
        self.log('>> BROWSER => Looking for more reviews ...')
        try:
            load_more_button = self._browser.find_element_by_xpath(
                '//div[@class="bv-content-pagination"]//button')
            more_reviews = load_more_button.is_displayed()
            # Safety cap: never click "Load more" over 25 times per page.
            max_pages = 25
            while more_reviews and max_pages:
                self.log('>> More reviews found...')
                load_more_button.click()
                self.log('>> BROWSER => CLICK "Load more"')
                # Fixed wait for the next review batch to render.
                time.sleep(20)
                self.log('>> OK')
                load_more_button = self._browser.find_element_by_xpath(
                    '//div[@class="bv-content-pagination"]//button')
                more_reviews = load_more_button.is_displayed()
                max_pages -= 1
            self.log('>> No more reviews...')
        except Exception, e:
            # Typically the button is absent (no reviews) -- log and
            # continue with whatever is already rendered.
            self.log('>> ERROR FOUND => %s' % e)
        # Parse the fully expanded DOM from the browser, not the original
        # Scrapy response.
        hxs = HtmlXPathSelector(text=self._browser.page_source)
        for review in hxs.select(
                '//ol[contains(@class, "bv-content-list-Reviews")]//li[contains(@class, "bv-content-review")]'
        ):
            review_loader = ReviewLoader(item=Review(), selector=review,
                                         date_format='%m/%d/%Y')
            review_loader.add_xpath(
                'date',
                u'.//div[@class="bv-content-datetime"][1]//meta[@itemprop="dateCreated"]/@content'
            )
            review_loader.add_xpath(
                'full_text', u'.//div[@itemprop="reviewBody"]/p/text()')
            review_loader.add_xpath(
                'rating',
                u'.//abbr[contains(@class, "bv-rating-stars-on")][1]/@title')
            review_loader.add_value('url', response.url)
            product['metadata']['reviews'].append(review_loader.load_item())
        yield product
def parse(self, response):
    """Parse a tyre search-results page into Product items.

    ``response.meta['row']`` carries the source spreadsheet row (Width,
    Aspect Ratio, Rim, Speed rating) used to fill in metadata when the
    page itself does not provide a value.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']
    products = hxs.select('//div[contains(@class, "tyreResult")]')
    for product in products:
        winter = product.select('.//li[@class="cw"]')
        # skip winter tyres
        if winter:
            continue
        loader = ProductLoader(item=Product(), selector=product)
        title = ' '.join(
            map(
                unicode.strip,
                product.select('.//div[@class="tyreName"]//text()').
                extract())).strip()
        brand = product.select('@data-brand').extract()
        if not brand:
            continue
        brand = brand[0].title()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        # The title starts with the brand -- drop it, keep the rest.
        title = title[len(brand):].strip()
        # Load/speed rating, e.g. "91H" or "104/102 R": group 1 is the
        # load index (possibly slash-separated), group 2 the speed symbol.
        results = re.search(
            r"\b((?:\d{2,3}/)*(?:\d{2,3}))\s?([A-Z]{1,2}\d?)\b", title)
        if results:
            load_rating = results.group(1)
            speed_rating = results.group(2)
            # Name is what precedes the rating; the remainder after the
            # rating is kept in `title` for manufacturer-mark scanning.
            name = title[:results.start(1)]
            title = title[results.end(2):]
        else:
            load_rating = ''
            speed_rating = row['Speed rating']
            name = title
            title = ''
        price = ''.join(
            product.select(
                './/div[@class="tyrePrice"]//text()').extract()).strip()
        loader.add_value('price', price)
        identifier = product.select(
            './/input[@name="id"]/@value').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('url', '')
        image_url = product.select(
            './/div[@class="tyreImage"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        # metadata['alternative_speed_rating'] = ''
        # The @data-types attribute flags extra-load ("xl") and run-flat
        # ("rf") variants.
        metadata['xl'] = 'Yes' if bool(
            product.select('./@data-types').re(r'xl')) else 'No'
        run_flat_found = is_run_flat(title)
        metadata['run_flat'] = 'Yes' if bool(
            product.select('./@data-types').re(
                r'rf')) or run_flat_found else 'No'
        # <li> classes mark OEM fitments (bmw/mer/aud/por).
        specif = product.select('.//ul//li/@class').extract()
        man_code = ''
        if 'bmw' in specif:
            man_code = '*'
        elif 'mer' in specif:
            man_code = 'MO'
        elif 'aud' in specif:
            man_code = 'AO'
        elif 'por' in specif:
            man_code = 'NO'
        # cut_name presumably strips a manufacturer-mark token from the
        # string and reports whether it was present (based on usage here).
        # The first match found in `name` wins unless an OEM class above
        # already set the code; otherwise fall back to scanning `title`.
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                if man_code == '':
                    man_code = man_mark
                break
        if man_code == '':
            for code, man_mark in self.all_man_marks.iteritems():
                result, title = cut_name(code, title)
                if result:
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code
        # Remove a stray "XL" marker from the product name as well.
        result, name = cut_name('XL', name)
        loader.add_value('name', name)
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
             speed_rating))
        # metadata['alternative_speed_rating']))
        prod = loader.load_item()
        prod['metadata'] = metadata
        if not is_product_correct(prod):
            continue
        prod['metadata']['mts_stock_code'] = find_mts_stock_code(
            prod, spider_name=self.name, log=self.log)
        yield prod
def parse_product(self, response):
    """Parse one product detail page into a ProductItem.

    Extracts name, number, description, categories, image, availability
    and sale price, then returns a follow-up Request (carrying the item
    in ``meta``) whose callback, ``parse_shipping_cost``, completes it.

    Returns ``None`` for the store root page.
    Raises ValueError when any mandatory field cannot be extracted.
    """
    # The store root carries no product data -- skip it.
    if response.url == self.store_url + '/':
        return

    hxs = HtmlXPathSelector(response)
    item = ProductItem()

    # Source
    item['source'] = self.store_url

    # Product Name
    tmp = self.extract_xpath(hxs, 'parse_product_product_name')
    if len(tmp) != 1:
        raise ValueError('No Product Name')
    item['product_name'] = tmp[0]

    # Product Number: try progressively weaker page locations.
    tmp = self.extract_xpath(hxs, 'parse_product_product_number_deal')  # In Stock
    if len(tmp) == 0:  # Out of Stock
        tmp = self.extract_xpath(hxs, 'parse_product_product_number_img')
    if len(tmp) == 0:  # Out of Stock and No Image
        tmp = self.extract_xpath(hxs,
                                 'parse_product_product_number_img_add2wl')
    if len(tmp) == 0:
        # Out of Stock, No Image and no Add2wl: fall back to the
        # "tell a friend" share link, which embeds the id as m=tell&p=<id>.
        tmp = self.extract_xpath(hxs, 'parse_product_product_number_share')
        if len(tmp) == 0:
            raise ValueError('No Product Number')
        ms = re.search(r'm=tell&p=(\d+)', tmp[0])
    else:
        # The other locations embed the id as pid=<id>.
        ms = re.search(r'pid=(\d+)', tmp[0])
    # FIX: the original called ms.groups() unguarded, so a non-matching
    # page raised AttributeError instead of the intended ValueError.
    if ms is None:
        raise ValueError('No Product Number')
    item['product_number'] = ms.group(1)

    # Description
    # FIX: was "len(tmp) is 0" -- identity comparison on ints is an
    # unreliable CPython artefact; use equality.
    tmp = self.extract_xpath(hxs, 'parse_product_description')
    if len(tmp) == 0:
        raise ValueError('No Description')
    item['description'] = '\n'.join(map(lambda s: s.strip(), tmp)).strip()

    # Category Name: breadcrumb entries; a new path begins at each 'Home'.
    tmp = self.extract_xpath(hxs, 'parse_product_categories')
    if len(tmp) <= 0:
        raise ValueError('No Categories')
    cg_paths = []
    cg_path = []
    for c in tmp:
        c = c.strip()
        if c == '':
            continue
        elif c == 'Home':
            # Flush the path collected so far and start a fresh one.
            cg_path = ProductItem.CG_PATH_SEP.join(cg_path)
            if cg_path != '':
                cg_paths.append(cg_path)
            cg_path = ['Home']
        else:
            cg_path.append(c)
    cg_paths.append(ProductItem.CG_PATH_SEP.join(cg_path))
    item['category_name'] = ProductItem.CG_PATHS_SEP.join(cg_paths)

    # Product URL
    item['product_url'] = response.url

    # Image URL
    tmp = self.extract_xpath(hxs, 'parse_product_image_url')
    if len(tmp) == 0:
        raise ValueError('No Image URL')
    # Reuse the extraction above instead of re-querying the page.
    item['image_url'] = tmp[0]

    # Product Condition
    item['product_condition'] = ProductItem.PC_NEW

    # Availability: normalise by stripping all whitespace and lowercasing.
    tmp = self.extract_xpath(hxs, 'parse_product_availability')
    if len(tmp) != 1:
        raise ValueError('No Availability')
    tmp = self.AVAIL_CHOICES.get(re.sub(r'\s+', '', tmp[0].lower()))
    if not tmp:
        raise ValueError('No such Availability')
    item['availability'] = tmp

    # Sale Price: strip the currency symbol and whitespace.
    # FIX: was r'[$|\s]', whose character class also deleted literal '|'.
    tmp = self.extract_xpath(hxs, 'parse_product_sale_price')
    tmp = re.sub(r'[$\s]', '', ''.join(tmp))
    item['sale_price'] = float(tmp)

    # On Sale: flagged by either a sale image or a "save" banner.
    item['on_sale'] = 0
    if (len(self.extract_xpath(hxs, 'parse_product_on_sale_img')) > 0 or
            len(self.extract_xpath(hxs, 'parse_product_on_sale_save')) > 0):
        item['on_sale'] = 1

    # Currency
    item['currency'] = 'AUD'

    # Manufacturer (not available on this site)
    item['manufacturer'] = ''

    # Shipping Cost is fetched via a second request; the partially
    # filled item travels along in request.meta.
    request = Request(self.SC_URL % (item['product_number'],),
                      callback=self.parse_shipping_cost,
                      dont_filter=True)
    request.meta['item'] = item
    return request
def parse_categories(self, response): hxs = HtmlXPathSelector(response) categories = hxs.select('//*[@id="PageMenu"]/div/a/@href').extract() for category in categories: url = urljoin_rfc(get_base_url(response), category) yield Request(url, callback=self.parse_products)
def parse(self, response): base_url = get_base_url(response) hxs = HtmlXPathSelector(response) items = hxs.select("//a/@href").extract() for item in items: yield Request(urljoin_rfc(base_url,item), callback=self.parse_items)
def parse(self, response):
    """Extract the top-level department categories from the touch nav.

    Parent entries that are not links (smaller groupings such as
    "resources" and "shops") are currently not extracted as categories
    themselves; departments living outside the "nav-pro" (products)
    container are only flagged as special.

    Yields a Request per department, handled by parseCategory, with the
    department info carried in meta.

    TODO: add extraction of the level-3 categories (broadest level:
    products, services, ...).
    """
    hxs = HtmlXPathSelector(response)
    parent_links = hxs.select(
        "//div[@id='container']/div[@id='header']//nav/ul[@id='nav-touch']//h4/a"
    )
    # Department ids are assigned in document order, starting at 1.
    for department_id, link in enumerate(parent_links, 1):
        item = CategoryItem()
        item['text'] = link.select('text()').extract()[0]
        item['url'] = link.select('@href').extract()[0]
        item['level'] = 1
        # Container element four levels above the <a> inside the <h4>.
        container = link.select(
            "parent::node()/parent::node()/parent::node()/parent::node()")
        container_class = container.select('@class').extract()
        # Mark the item as special when its container is not "nav-pro".
        if not container_class or container_class[0] != "nav-pro":
            item['special'] = 1
        # Parent entries carry no real URL (href is just "#"), so only
        # the text is recorded.
        container_text = container.select('a/text()').extract()
        if container_text:
            item['parent_text'] = container_text[0]
        yield Request(item['url'],
                      callback=self.parseCategory,
                      meta={'parent': item,
                            'level': 1,
                            'department_text': item['text'],
                            'department_url': item['url'],
                            'department_id': department_id})
def parse_product(self, response): base_url = get_base_url(response) hxs = HtmlXPathSelector(response) res = {} options = hxs.select( "//select[@name='SKURecNum']/option/text()").extract() if options: #options name = hxs.select( "//div[@class='buybox']/table/tr/td/h1/text()").extract() if not name: name = hxs.select( "//h1[@class='productNameDN']/text()").extract() url = response.url for option in options: try: name2 = re.match(r'(.*) -.*', option.strip()).group(1) except: continue try: price = re.match( r'.*\xa3(.*)', option.replace("\r", "").replace("\n", "").strip()).group(1) except: price = None if not price: price = "".join( hxs.select( "//p[@class='ProductDetailPrice']/a/font[@class='BodyMain']/text()" ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( '//p[@class="ProductDetailPrice"]/font[1]/b/text()' ).re(r'\xa3([0-9\,\. ]+)')).strip() res['url'] = urljoin_rfc(base_url, url) res['description'] = name[0].strip() + u' ' + name2 res['price'] = price yield load_product(res, response) else: name = hxs.select( "//div[@class='buybox']/table/tr/td/h1/text()").extract() if not name: name = hxs.select( "//h1[@class='productNameDN']/text()").extract() if not name: name = hxs.select( "//div[@class='buybox']/table/tr/td/table/tr/td/h1/text()" ).extract() url = response.url price = "".join( hxs.select( "//p[@class='ProductDetailPrice']/strong/a/font[@class='BodyMain']/text()" ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( "//p[@class='ProductDetailPrice']/a/font[@class='BodyMain']/text()" ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( '//p[@class="ProductDetailPrice"]/strong/font/b/text()' ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( '//p[@class="ProductDetailPrice"]/font/b/text()' ).re(r'\xa3([0-9\,\. ]+)')).strip() res['url'] = urljoin_rfc(base_url, url) res['description'] = name[0].strip() res['price'] = price yield load_product(res, response)