def parse_letv_com(self, response):
    """Scrape play count and comment count for a letv.com video page.

    Expects response.meta['item'] to carry a partially-filled item whose
    'doc' dict receives pv/up/down/comments.
    """
    hxs = HtmlXPathSelector(response)
    pid = hxs.re('pid:(\d+)')[0]
    vid = hxs.re('vid:(\d+)')[0]
    mid = hxs.re('mmsid:(\d+)')[0]
    # Play-count (pv) API.
    url_t = "http://stat.letv.com/vplay/queryMmsTotalPCount?callback=&cid=1&vid=%s&mid=%s&pid=%s"
    url = url_t % (vid, mid, pid)
    text = urllib.urlopen(url).read()
    pv = re.findall('media_play_count.*?(\d+)', text)[0]
    # letv exposes no up/down vote counts on this endpoint.
    up = 0
    down = 0
    # Comment-count API.
    # BUG FIX: the original URL contained the mojibake "video¬ice=1" --
    # the "&not" of "&notice" had been HTML-entity-decoded into U+00AC,
    # corrupting the query string.
    url_tt = "http://api.my.letv.com/vcm/api/g?jsonp=&type=video&notice=1&pid=%s&xid=%s&mmsid=%s&rows=10&page=1"
    url2 = url_tt % (pid, vid, mid)
    text2 = urllib.urlopen(url2).read()
    comments = re.findall('total.*?(\d+)', text2)[0]
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse_list_page(self, response):
    """Extract shop name/site-url pairs from a supplier listing page and
    yield an item (or follow-up request) per shop cell."""
    multi_xpath = '//div[@class="supply-cell" or @class="supply-cell supply-cell-bg"]'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    for cell in page_hxs.select(multi_xpath):
        # Shop display name may span several text nodes.
        shop_name = clean_string(''.join(cell.select('./div/div/span/a//text()').extract()))
        shop_site_url = ''.join(cell.select('./div/div/span/a[1]/@href').extract())
        shop_site_url = urllib.unquote(shop_site_url).strip()
        detail_url = shop_site_url
        doc = {
            'shop_name': shop_name,
            'shop_site_url': shop_site_url,
        }
        if shop_site_url:
            next_request = Request(detail_url,
                                   headers={'referer': shop_site_url},
                                   callback=self.parse_about_page)
        else:
            next_request = None
        item = LegItem(collection=self.collection, doc=doc,
                       next_request=next_request,
                       list_url=response.url,
                       query=response.meta['query'])
        yield self.item_or_request(item)
def item_parse(self, response): hxs = HtmlXPathSelector(text=response.body) page_ele = hxs.select('//div[@class="memo"]/text()') if len(page_ele) > 0: text = page_ele[0].extract() result = re.search(ur"北京时间(.+) 在(.+)\((.+),(.+)\) 发生(.+)级地震,震源深度(.+)公里", text, re.UNICODE) #print result.group(1), result.group(2), result.group(3), result.group(4), result.group(5), result.group(6) epoch = time.mktime(time.strptime(result.group(1), "%Y-%m-%d %H:%M")) if (epoch < self.last_timestamp): return name = result.group(2) magnitude = result.group(5) depth = result.group(6) latitude_re = re.search(r"([^\d]+)(\d+\.\d+)", result.group(3)) latitude = float(latitude_re.group(2)) if latitude_re.group(1) == u"北纬" else 0 - float(latitude_re.group(2)) longtitude_re = re.search(r"([^\d]+)(\d+\.\d+)", result.group(4)) longtitude = float(longtitude_re.group(2)) if longtitude_re.group(1) == u"东经" else 0 - float(longtitude_re.group(2)) #print latitude, longtitude db_file = self.db_file conn = sqlite3.connect(db_file) c = conn.cursor() print "insert %s to %f, %f\n" % (name, latitude, longtitude) c.execute("INSERT OR REPLACE INTO quake (name, longtitude, latitude, timestamp, depth, magnitude, source_url) VALUES (?,?,?,?,?,?,?)", [name, longtitude, latitude, epoch, depth, magnitude, response.url] ) conn.commit()
def parse_sohu_com(self, response):
    """Collect pv / up / down / comment counters for a sohu.com video."""
    hxs = HtmlXPathSelector(response)
    vid = ''.join(hxs.re('var vid="(\d+)')).strip()
    pid = ''.join(hxs.re('var playlistId="(\d+)')).strip()
    cid = ''.join(hxs.re('var cid="(\d+)')).strip()
    # Play count.
    pv_url = "http://count.vrs.sohu.com/count/stat.do?videoId=%s&playlistId=%s&categoryId=%s" % (vid, pid, cid)
    pv = ''.join(re.findall('(\d+)', urllib.urlopen(pv_url).read()))
    # Up/down votes come back as JSON wrapped in a JS callback; slice out
    # the braces before decoding.
    vote_url = "http://score.my.tv.sohu.com/digg/get.do?vid=%s&type=%s" % (vid, cid)
    raw = urllib.urlopen(vote_url).read()
    dj = json.loads(raw[raw.find('{'): raw.rfind('}') + 1])
    up = dj['upCount']
    down = dj['downCount']
    # Comment count.
    comments_url = "http://access.tv.sohu.com/reply/list.do?objid=%s&subobjid=%s&objtype=%s" % (pid, vid, cid)
    comments = re.findall('"allCount":(\d+)', urllib.urlopen(comments_url).read())[0]
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def detail(self, response):
    """Build one BillionPricesIndiaItem per (price, seller) pair found on a
    mobile-phone product page."""
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    variants_price = hxs.select("//div[@class='fleft catbox pricerate']//span/text()").extract()
    variants_seller = hxs.select("//div[@class='catbox fleft storeimage']/img/@alt").extract()
    items = []
    # BUG FIX: the original condition used "or ... != None", which is always
    # true for the list returned by extract(); require both lists non-empty.
    # (Dead locals quantitylist/pricelist removed.)
    if variants_price and variants_seller:
        for price, seller in zip(variants_price, variants_seller):
            item = BillionPricesIndiaItem()
            item['date'] = time.strftime("%d/%m/%Y")
            # Seller alt text ends with the vendor name.
            item['vendor'] = seller.split(" ")[-1:][0]
            item['product'] = response.url.split('/')[-1].split(".")[0]
            # Strip thousands separators, keep the trailing numeric token.
            itemprice = re.sub('[,]', '', price).split(" ")[-1:][0]
            item['category'] = "mobiles"
            item['price'] = float(itemprice)
            item['quantity'] = '1'
            item['measure'] = 'pcs'
            item['unitprice'] = float(itemprice)
            items.append(item)
    return items
def parse(self, response):
    """Translate IKEA product names (Swedish -> English) via the Microsoft
    Translator API and return one IkeaItem per product link."""
    # Obtain an OAuth access token for the translation API.
    atrequest = urllib2.Request('https://datamarket.accesscontrol.windows.net/v2/OAuth2-13')
    atrequest.add_data(atdata)
    access_token = json.loads(urllib2.urlopen(atrequest).read())['access_token']
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//span[contains(@class, "productsAzLink")]/a/text()').extract()
    items = []
    for site in sites:
        item = IkeaItem()
        # First word is the product name, the rest describes the thing.
        item['name'], _, item['thing'] = unicode(site).partition(' ')
        query = {'text': unicode(item['name']), 'from': 'sv', 'to': 'en'}
        request = urllib2.Request('http://api.microsofttranslator.com/v2/Http.svc/Translate?' + urllib.urlencode(query))
        request.add_header('Authorization', 'Bearer ' + access_token)
        # Renamed from `response` -- the original shadowed the callback arg.
        api_response = urllib2.urlopen(request)
        doc = etree.fromstring(api_response.read())
        text = []
        for elem in doc.xpath('/foo:string', namespaces={'foo': 'http://schemas.microsoft.com/2003/10/Serialization/'}):
            if elem.text:
                elem_text = ' '.join(elem.text.split())
                if len(elem_text) > 0:
                    text.append(elem_text)
        item['translation'] = ' '.join(text)
        items.append(item)
    return items
def parse(self, response):
    """Scan the 'Camara dos Deputados/BR' box, upsert each deputy into the
    local DB, and schedule assiduity/costs detail requests for each one."""
    hxs = HtmlXPathSelector(response)
    # Each anchor encodes url, name, party and state of one deputy.
    deputy_re = re.compile('<a class="listapar" href="(?P<url>.*?)">(?P<name>[\w\s]*[\w]+)\s*\(<b>[\w\s]+</b>\)\s-\s(?P<party>.*?)\/(?P<state>.*?)</a><br>', flags=re.U)
    for div in hxs.select('//div[@id="contem_boxes"]'):
        titulo = div.select('.//div[@id="contem_titulo"]/text()').extract()[0]
        if not titulo.endswith(u'mara dos Deputados/BR'):
            continue
        for match in deputy_re.finditer(div.extract()):
            dict_deputy = match.groupdict()
            db_deputy = self.api.get_deputado_por_nome(dict_deputy['name'])
            if db_deputy:
                dep = db_deputy[0]
            else:
                dep = Deputado(dict_deputy['name'], dict_deputy['state'], dict_deputy['party'])
                self.api.inserir_deputado(dep)
            id = urlparse.parse_qs(urlparse.urlparse(dict_deputy['url']).query).get('id', [0])[0]
            if not id:
                continue
            request = Request(urljoin(self.base_url, '@presencas.php?id=%s' % id), callback=self.parse_deputy_assiduity)
            request.meta['dep'] = dep
            yield request
            request = Request(urljoin(self.base_url, '@uso_verbas_als.php?uf=16&id=%s' % id), callback=self.parse_deputy_costs)
            request.meta['dep'] = dep
            yield request
def browse_and_parse(self, response):
    """Recursively browse category pages, follow pagination, and yield a
    product item per listing row (plus the single-product edge case)."""
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    # Follow each side-bar category link exactly once.
    for subcat_href in hxs.select('//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url in self.navig_url_set:
            continue
        self.navig_url_set.add(subsubcat_url)
        yield Request(subsubcat_url, callback=self.browse_and_parse)
    # Pagination ("Neste" = next page).
    next_page = hxs.select("//div[@id='productListing']//div[@id='productsListingListingTopLinks']//a[contains(., 'Neste')]/@href")
    if next_page:
        yield Request(next_page[0].extract(), callback=self.browse_and_parse)
    # Product rows in this listing page, if any.
    for tr in hxs.select('//div[@id="productListing"]//tr[@class="productListing-even" or @class="productListing-odd"]'):
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('url', tr.select(".//td[2]//a/@href").extract()[0])
        loader.add_value('name', tr.select(".//td[2]//a/text()").extract()[0])
        loader.add_value('price', tr.select(".//td[3]/text()").extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield loader.load_item()
    # Edge case: a listing page that holds a single product.
    product_price = hxs.select('//h2[@id="productPrices"]/text()').extract()
    if product_price:
        loader = ProductLoader(item=Product(), response=response)
        loader.add_xpath('name', '//h1[@id="productName"]/text()')
        loader.add_value('url', response.url)
        loader.add_value('price', product_price[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield loader.load_item()
def parse(self, response): connection = pymongo.MongoClient("localhost", 30000) db = connection.academic hps = db.homepages tmp = hps.find_one({"url": response.url}) if not tmp: hxs = HtmlXPathSelector(response) urls = hxs.select('//a') contents = hxs.select('//p | //a | //b | //tr | //td | //li | //ul | //font | //span | //strong | //h1 | //h2 | //h3') link = [] text = "" for url in urls: u = ''.join(url.select('@href').extract()) if u[-4:] == ".pdf": link.append(u) for content in contents: s = ''.join(content.select('text()').extract()) if len(s) > 3: text += s hp = { "url" : response.url, "link" : link, "text" : text } print "[insert]" hps.insert(hp) else: print "[redundent]"
def parse_item(self, response):
    """Extract a RecipeItem from a recipe detail page.

    Further transforms are applied downstream by openrecipes.pipelines.
    """
    hxs = HtmlXPathSelector(response)
    # Scope element that contains the recipe info; normally a single match.
    base_path = """//div[@class="recipe-details"]"""
    recipes_scopes = hxs.select(base_path)
    name_path = '//h1[@itemprop="name"]/text()'
    recipeYield_path = '//label[@for="set_servings"]/input/@value'
    description_path = '//span[@itemprop="summary"]/p/text()'
    image_path = '//img[@class="the_recipe_image"]/@src'
    cookTime_path = '//form/p/time[@itemprop="cookTime"]/@datetime'
    prepTime_path = '//form/p/time[@itemprop="prepTime"]/@datetime'
    ingredients_path = '//span[@itemprop="ingredient"]'
    ingredients_amounts_path = './span[@itemprop="amount"]/text()'
    # BUG FIX: this previously pointed at the "amount" span again, so every
    # ingredient came out as "amount amount" and names were lost.
    ingredients_names_path = './span[@itemprop="name"]/text()'
    datePublished_path = '//span[@itemprop="published"]/@datetime'
    recipes = []
    for r_scope in recipes_scopes:
        item = RecipeItem()
        item['source'] = self.source
        item['name'] = r_scope.select(name_path).extract()
        item['image'] = r_scope.select(image_path).extract()
        item['description'] = r_scope.select(description_path).extract()
        item['url'] = response.url
        item['prepTime'] = r_scope.select(prepTime_path).extract()
        item['cookTime'] = r_scope.select(cookTime_path).extract()
        item['recipeYield'] = r_scope.select(recipeYield_path).extract()
        item['datePublished'] = r_scope.select(datePublished_path).extract()
        # Grab the amount and name spans separately, then pair them into
        # "amount name" strings.
        ingredient_scopes = r_scope.select(ingredients_path)
        amount = ingredient_scopes.select(ingredients_amounts_path).extract()
        name = ingredient_scopes.select(ingredients_names_path).extract()
        item['ingredients'] = [" ".join(ing).encode('utf-8') for ing in zip(amount, name)]
        recipes.append(item)
    return recipes
def parse_list(self, response):
    """Follow every paper link on a listing page, then recurse into the
    next listing page when one exists."""
    hxs = HtmlXPathSelector(response)
    for href in hxs.select(r'//ul[@id="paper-listing"]//a/@href').extract():
        yield Request(urlparse.urljoin(response.url, href), callback=self.parse_paper)
    next = hxs.select(r'//div[@class="pagination"]/ul/li[@class="next"]/a/@href')
    if next:
        yield Request(urlparse.urljoin(response.url, next[0].extract()), callback=self.parse_list)
def parse(self, response):
    """Collect every <form> on the page, with one InputItem per <input>
    field, into FormItem objects."""
    print(response)
    hxs = HtmlXPathSelector(response)
    forms = hxs.select('//form')
    print(forms)
    items = []
    for form in forms:
        formItem = FormItem()
        formItem["actionURL"] = form.select("@action").extract()
        items.append(formItem)
        inputItems = []
        for formInput in form.select(".//input"):
            inputItem = InputItem()
            inputItem["name"] = formInput.select("@name").extract()
            inputItem["size"] = formInput.select("@size").extract()
            inputItem["maxlength"] = formInput.select("@maxlength").extract()
            inputItem["value"] = formInput.select("@value").extract()
            # BUG FIX: HTML inputs carry a "type" attribute; the original
            # selected "@inputType", which matched nothing.
            inputItem["inputType"] = formInput.select("@type").extract()
            inputItems.append(inputItem)
        formItem["inputs"] = inputItems
    return items
def parse_start_url(self, response):
    """Build one BodytestItem per post in a forum topic page.

    The first poster on the page is taken to be the topic author.
    """
    x = HtmlXPathSelector(response)
    posters = x.select("//b[@class='postauthor']/text()").extract()
    # FIX: guard against a page with no posts (posters[0] raised IndexError).
    if not posters:
        return []
    op = posters[0]
    url = response.url
    title = x.select("//div[@id='pageheader']/h2/a/text()").extract()
    # NOTE(review): this selects every post body on the page; the pairing
    # with posters[i] below assumes the two lists line up one-to-one.
    post_body = x.select("//div[@class='postbody']").extract()
    # (Removed `posters_export`: it claimed to deduplicate posters but only
    # copied the list with op prepended and was never used.)
    topics = []
    for i, pb in enumerate(post_body):
        topic = BodytestItem()
        topic['topic_url'] = url
        topic['topic_title'] = title
        topic['thread_author'] = op
        topic['post_author'] = posters[i]
        topic['post_body'] = pb
        topics.append(topic)
    return topics
def result_page(self, response):
    """Yield one NssfcrawlerItem per non-empty store cell in the results table.

    Each cell's text nodes are: [?, store_name, address lines..., '',
    contact/phone section...].
    """
    hxs = HtmlXPathSelector(response)
    store_details = hxs.select('//*[@id="cb_block_inner"]/table/tr//td')
    state = response.meta['state']
    for store_detail in store_details:
        if not ''.join(store_detail.select('.//text()').extract()).strip():
            continue
        detials = [det.strip() for det in store_detail.select('.//text()').extract()]
        store_name = detials[1]
        # Address lines run until the first empty string.
        address = []
        # FIX: index_break was undefined when no blank separator was found.
        index_break = len(detials)
        for det in detials[2:]:
            if det:
                address.append(det)
            else:
                index_break = detials[2:].index('') + 2
                break
        location = None
        if address:
            if len(address) == 1:
                address = address[0]
            else:
                # Last address line is the town/location.
                location = address[-1]
                address = address[:-1]
        # FIX: contact_name / phone_no previously leaked across iterations
        # (or raised NameError) when a store had no "Contact" section.
        contact_name = None
        phone_no = None
        res = '!'.join(detials[index_break:])
        if 'Contact' in res:
            contact_name = res[res.find('t!: ') + 4:res.find('!!')]
            ph = re.compile('\(\d+\).\d+\-\d+').findall(res)
            if ph:
                phone_no = ph[0]
        item = NssfcrawlerItem(state=state, store_name=store_name, address=address,
                               location=location, contact_name=contact_name,
                               phone_no=phone_no)
        yield item
def parse(self, response):
    """
    The lines below is a spider contract. For more info see:
    http://doc.scrapy.org/en/latest/topics/contracts.html

    @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
    @scrapes name
    """
    hxs = HtmlXPathSelector(response)
    # One item per directory entry, e.g. //*[@id="bd-cross"]/fieldset[3]/ul/li[1]
    items = []
    for site in hxs.select('//ul/li'):
        item = Website()
        item['name'] = site.select('a/text()').extract()
        item['url'] = site.select('a/@href').extract()
        # Whitespace-trimmed description fragments (renamed loop var: the
        # original shadowed the builtin `str`).
        item['description'] = [frag.strip() for frag in site.select('text()').extract()]
        items.append(item)
    return items
def parse_item(response):
    """Scrape one Compuindia product page into a CompuindiaItem.

    Most selector results arrive as single-element lists; index 0 is taken
    and encoded to a UTF-8 byte string where the field expects text.
    """
    hxs = HtmlXPathSelector(response)
    item = CompuindiaItem()
    item['sourceurl'] = [response.url]
    item['code'] = hxs.select('//td[@class="data"]/text()')[0].extract().encode('utf-8')
    item['price'] = hxs.select('//span[@class="price"]/text()')[0].extract().encode('utf-8')
    # Not scraped yet; a match against class="last odd" could fill this in.
    item['color'] = [None]
    item['name'] = hxs.select("//div[@class='product-name']/h1/text()").extract()[0]
    item['features'] = hxs.select('//ul[@class="config_listing_pd_page"]/li/text()').extract()
    item['specs'] = hxs.select('//div[@class="box-collateral box-additional"]').extract()[0].encode('utf-8')
    item['description'] = hxs.select('//div[@class="box-collateral box-description"]').extract()[0].encode('utf-8')
    item['moreDescription'] = [None]
    item['additionalInfo'] = hxs.select('//div[@id="additional"]').extract()[0].encode('utf-8')
    item['relatedProducts'] = [None]  # not scraped for the moment
    # Images: the main product image plus the "more views" thumbnails,
    # deduplicated.
    main_img = hxs.select("//p[@class='product-image']/a/@href").extract()
    img_urls = hxs.select("//div[@class='more-views']/ul/li/a/@href").extract()
    item['image_urls'] = list(set(main_img + img_urls))
    return item
def parse_page2(self, response):
    """Write one CSV row for an article page; title/source/link arrive via
    response.meta from the listing-page callback."""
    sel = HtmlXPathSelector(response)
    article = ''.join(sel.xpath('//p/text()').extract())
    subheadline = ''.join(sel.xpath('//h2[@class="subheadline"]/text()').extract())
    # Relative age text, e.g. "3 hours 12 minutes ago".
    age_text = ''.join(sel.xpath('//abbr/text()').extract())
    millis = int(round(time.time() * 1000))  # current time in ms
    # BUG FIX: the original indexed age_text[1] -- a single character of the
    # joined string -- so the hour/minute branches never executed and str3
    # could be referenced before assignment.
    ntime = 0.0  # article age in minutes
    parts = age_text.split(" ")
    try:
        if "hour" in age_text:
            ntime += float(parts[0]) * 60
            if "minute" in age_text and len(parts) > 2:
                ntime += float(parts[2])
        elif "minute" in age_text:
            ntime += float(parts[0])
    except ValueError:
        # Non-numeric leading token (e.g. "an hour ago"); leave age at 0.
        ntime = 0.0
    # Timestamp column intentionally left blank (see disabled computation in
    # the project history).
    articletime = " "
    title = response.meta['Title']
    linktime = response.meta['LinkTime']
    source = response.meta['Source']
    link = response.meta['Link']
    # Store everything in the shared CSV writer.
    Consumer.writer.writerow([title.encode("utf-8"), subheadline.encode("utf-8"),
                              source.encode("utf-8"), linktime.encode("utf-8"),
                              articletime.encode("utf-8"), article.encode("utf-8"),
                              link.encode("utf-8")])
def parse(self, response):
    """Run the configured XPath checker against the page and delete the
    referenced object when the checker matches."""
    hxs = HtmlXPathSelector(response)
    # Checker type '4' means 404-only checking: a delivered page passes.
    if self.scraper.checker_type == '4':
        self.log("No 404. Item kept.", log.INFO)
        return
    try:
        test_select = hxs.select(self.scraper.checker_x_path).extract()
    except ValueError:
        self.log('Invalid checker x_path!', log.ERROR)
        return
    expected = self.scraper.checker_x_path_result
    if test_select and expected == '':
        # Mere presence of the element is the signal.
        self.log("Elements for XPath found on page (no result string defined).", log.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
    elif test_select and test_select[0] == expected:
        self.log("XPath result string '" + expected + "' found on page.", log.INFO)
        if self.conf['DO_ACTION']:
            self._del_ref_object()
    else:
        self.log("XPath result string not found. Item kept.", log.INFO)
    return
def parse(self, response):
    """Follow category and pagination links, yield the page's products, and
    re-request an empty page up to three times."""
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//div[@class="navigation"]/ul/li/a/@href').extract()
    categories += hxs.select('//ul[@class="cl_subs"]//a/@href').extract()
    loaded = False
    for category in categories:
        loaded = True
        yield Request(category)
    next_page = hxs.select('//a[@rel="next"]/@href').extract()
    if next_page:
        loaded = True
        yield Request(urljoin_rfc(get_base_url(response), next_page[0]))
    products = list(self.parse_products(hxs))
    for product in products:
        yield product
    # The site sometimes serves an empty shell; retry a few times.
    retries = response.meta.get('retries', 0)
    if (not products or not loaded) and retries < 3:
        yield Request(response.url, dont_filter=True, meta={'retries': retries + 1})
def parse_product(self, response):
    """Yield one product per variant row on a product page; the variant SKU
    text is appended to the page-level product name."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    name = hxs.select('//td[@class="ProductDetails"]/h1/text()').extract()
    # BUG FIX: when the <h1> was missing, `name` stayed a list, so the later
    # `n += ' ' + sku` extended it character by character instead of
    # concatenating strings.
    name = name[0].strip() if name else ''
    url = urljoin_rfc(get_base_url(response), response.url)
    for item in hxs.select('//div[@class="Item"]'):
        loader = ProductLoader(item=Product(), selector=item)
        loader.add_value('url', url)
        sku = ''.join(item.select('./text()').extract())
        n = name
        if sku:
            n += ' ' + sku.strip()
        loader.add_value('name', n)
        # Two possible price locations; the loader takes the first match.
        loader.add_xpath('price', './/span[@class="price"]/text()')
        loader.add_xpath('price', './div[@class="price"]/span/text()')
        yield loader.load_item()
def parse(self, response):
    """
    Default callback used by Scrapy to process downloaded responses

    Testing contracts:
    @url http://www.livingsocial.com/cities/15-san-francisco
    @returns items 1
    @scrapes title link
    """
    selector = HtmlXPathSelector(response)
    for deal in selector.xpath(self.deals_list_xpath):
        loader = XPathItemLoader(LivingSocialDeal(), selector=deal)
        # Strip whitespace on input; join fragments with a space on output.
        loader.default_input_processor = MapCompose(unicode.strip)
        loader.default_output_processor = Join()
        # Register each configured field's XPath with the loader.
        for field, xpath in self.item_fields.iteritems():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
    # Output as json file: scrapy crawl livingsocial -o items.json
def parse_products(self, response):
    """Request each product's detail page; paginate only one level deep
    (pages reached via pagination do not paginate again)."""
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//div[@class="prod"]')
    for product in products:
        name = product.select('div/form/fieldset/div/h5/a/span/text()').extract()[0].strip()
        url = product.select('div/form/fieldset/div/h5/a/@href').extract()
        if url:
            url = urljoin_rfc(get_base_url(response), url[0])
        price = product.select('div/form/fieldset/div/span[@class="productPrice priceExVAT"]/text()').extract()[0].strip()
        yield Request(url, callback=self.parse_product, meta={'name': name, 'price': price})
    pages = hxs.select('//span[@class="pagingButton"]/a/@href').extract()
    if pages:
        # FIX: use .get() -- 'do_pagination' is absent from meta on the
        # initial request and the plain [] lookup raised KeyError.
        if response.meta.get('do_pagination'):
            for page in pages:
                url = urljoin_rfc(get_base_url(response), page)
                yield Request(url, callback=self.parse_products, meta={'do_pagination': False})
    else:
        # No pagination buttons: descend into sub-categories instead.
        for sub_category in hxs.select('//div[@class="subcat"]/div/a/@href').extract():
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, callback=self.parse_products, meta={'do_pagination': True})
def parse(self, response):
    """Archive the index page HTML under ./archive/HTML/<date>/ and follow
    the first magic-set category link (debug mode)."""
    hxs = HtmlXPathSelector(response)
    magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/text()').extract()
    links_to_magic_sets_full = hxs.select('//div[@class="left_block"]//ul[@class="left_menu"]//li/a/@href').extract()
    # Only the first category is used while debugging.
    magic_sets = magic_sets_full[0]
    links_to_magic_sets = links_to_magic_sets_full[0]
    # After debugging, use instead:
    #   magic_sets_zip = dict(zip(magic_sets_full, links_to_magic_sets_full))
    magic_sets_zip = dict([[magic_sets, links_to_magic_sets]])
    date_prefix = time.strftime("%Y%m%d", time.localtime())
    try:
        os.mkdir("./archive/HTML/" + date_prefix)
    except OSError:
        self.log("The folder exists!")
    filename = "./archive/HTML/" + date_prefix + "/" + response.url.split("/")[-1] + ".htm"
    self.log("This is filename for index: %s" % (filename,))
    # FIX: use `with` so the archive file handles are closed promptly
    # instead of being leaked to the garbage collector.
    try:
        with open(filename, "wb") as f:
            f.write(response.body)
    except OSError:
        os.remove(filename)
        with open(filename, "wb") as f:
            f.write(response.body)
    # Continue to extract data.
    for magic_set, url in magic_sets_zip.iteritems():
        abs_url = urljoin("http://www.blackborder.com", url)
        self.log("This is magic set name and url to it: %s ---> %s" % (magic_set, abs_url))
        request = Request(abs_url, callback=self.parse_set_page)
        request.meta["magic_set"] = magic_set
        request.meta["date_prefix"] = date_prefix
        yield request
def getComments(self, response):
    """Attach the seller-comments text to the item, then (best-effort)
    follow the eBay bid-history page to fetch the posting date."""
    Item = response.meta['item']
    # Normalize the page: ASCII only, entities decoded twice, NBSP and
    # whitespace control characters removed, scripts/styles stripped.
    res_text = response.body_as_unicode().encode('ascii', 'ignore')
    res_text = smart_str(self.parser.unescape(self.parser.unescape(res_text))).replace('\xc2\xa0', '')
    res_text = res_text.replace('\n', ' ').replace('\t', ' ').replace('\r', '')
    res_text = re.subn('<script.*?</script>', '', res_text)[0]
    res_text = re.subn('<style.*?</style>', '', res_text)[0]
    hxs = HtmlXPathSelector(text=res_text)
    comments = ''
    for val in hxs.select('//div[@id="ds_div"]//text()').extract():
        val = val.strip()
        if val != '':
            comments += val + ' '
    Item['Comments'] = comments
    try:
        offers_url = 'http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=' + Item['eBay_Item_Number']
        if Item['eBay_Item_Number'] != 'NA' and Item['eBay_Item_Number'] != '':
            req = Request(offers_url, dont_filter=True, callback=self.getPostingDate)
            req.meta['item'] = Item
            return req
    except Exception:
        # FIX: was a bare `except:` which also swallowed SystemExit /
        # KeyboardInterrupt; the deliberate best-effort fallthrough is kept.
        pass
    return Item
def parse(self, response):
    """Build historical gasoline price items from the IOC previous-prices
    table (one item per price/date pair)."""
    log.msg(response.url)
    hxs = HtmlXPathSelector(response)
    variants_date = hxs.select("//span[@class='normal']//text()").extract()
    variants_price = hxs.select("//table[@id='objContPreviousPrices_grdPreviousPrices']//tr//td[@class='normal']//text()").extract()
    # NOTE: a 4-wide rolling average (av_price) used to be computed here via
    # self.__group_iter but was never read; the dead code was removed.
    items = []
    for price, date in zip(variants_price, variants_date):
        item = BillionPricesIndiaItem()
        quantity = '1 lt'
        item['date'] = date
        item['vendor'] = "ioc"
        item['product'] = "gasoline"
        item['category'] = "oil and gas"
        value, measure, unitprice = self.__unit_price(price, quantity)
        item['price'] = price
        item['quantity'] = value
        item['measure'] = measure
        item['unitprice'] = unitprice
        items.append(item)
    return items
def parse_qq_com(self, response):
    """Collect pv / vote / comment-page counters for a qq.com video."""
    hxs = HtmlXPathSelector(response)
    pid = ''.join(hxs.re('id :"(\w+)",'))
    vid = ''.join(hxs.re('vid:"(\w+)",'))
    # Play count.
    t1 = urllib.urlopen("http://sns.video.qq.com/tvideo/fcgi-bin/batchgetplaymount?id=%s&otype=json" % (pid,)).read()
    pv = ''.join(re.findall('"num":(\d+)', t1)).strip()
    # Votes: the original unpacks the two "num" values as (down, up).
    t2 = urllib.urlopen("http://sns.video.qq.com/tvideo/fcgi-bin/spvote?&t=3&otype=json&keyid=%s" % (vid,)).read()
    down, up = re.findall('"num":(\d+)', t2)
    # Comment page count ("totpg").
    t3 = urllib.urlopen("http://sns.video.qq.com/fcgi-bin/liveportal/comment?otype=json&p=1&t=0&sz=10&id=%s" % (pid,)).read()
    comments = ''.join(re.findall('"totpg":(\d+)', t3))
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse_youku_com(self, response):
    """Collect pv / up / down / comment counters for a youku.com video via
    the v_vpactionInfo stats page."""
    hxs = HtmlXPathSelector(response)
    video_id = hxs.re('var videoId.*?(\d+)')[0]
    url = "http://v.youku.com/v_vpactionInfo/id/%s" % (video_id,)
    text = urllib.urlopen(url).read()
    hxs2 = HtmlXPathSelector(text=text)
    # Counts are rendered with thousands separators, e.g. "1,234,567".
    pv = hxs2.select('//ul[@class="row"]//span[@class="num"]/text()').extract()[0]
    pv = int(''.join(pv.split(',')))
    # d_tmp[0] is "up/down"; d_tmp[2] is the comment count.
    # (Removed the unused local `ud` the original assigned here.)
    d_tmp = hxs2.select('//ul[@class="half"]//span/text()').extract()
    up, down = d_tmp[0].split('/')
    up, down = int(''.join(up.split(','))), int(''.join(down.split(',')))
    comments = int(''.join(d_tmp[2].split(',')))
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse(self, response):
    """Follow pay-as-you-go image-map links that filter by manufacturer."""
    hxs = HtmlXPathSelector(response)
    # Only URLs containing "/pay-as-you-go/" with a "manufacturer=" query.
    category_xpath = ('//map[@name="Mapre"]/area[contains(@href,"/pay-as-you-go/")'
                      ' and contains(@href,"manufacturer=")]/@href')
    for category in hxs.select(category_xpath).extract():
        url = urljoin_rfc(response.url, category, response.encoding)
        yield Request(url, callback=self.parse_cat)
def parse_sina_com_cn(self, response):
    """Collect pv and comment counters for a sina.com.cn video page."""
    hxs = HtmlXPathSelector(response)
    vid = hxs.re('vid:.*?(\d+)\|\d+')[0]
    nid = hxs.re("newsid:'([-\w]+)")[0]
    # Play count; the endpoint takes a "vid-vid" pair.
    url = "http://count.kandian.com/getCount.php?vids=%s&action=flash" % ("%s-%s" % (vid, vid))
    data = urllib.urlopen(url).read()
    pv = re.findall('\d+":"(\d+)', data)[0]
    # sina exposes no vote counts here.
    up = 0
    down = 0
    # Comment count: JSONP response, so strip the wrapping parentheses
    # before decoding.
    url2 = "http://comment5.news.sina.com.cn/cmnt/info_wb?channel=movie&newsid=%s&page=1&callback=" % (nid,)
    data2 = urllib.urlopen(url2).read()
    dj = json.loads(data2[1:-1])
    comments = dj["result"]['data']['total_number']
    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse(self, response):
    """Parse one iXBT forum page: extract posts, emit IxbtItem objects,
    and follow the "next page" link.

    Posts are embedded in ``t_post(...)`` script calls; their arguments
    are recovered with a regex rather than a JS parser.  Posts with more
    than 2 "thanks" also get their jokes appended to ``self.file``
    (UTF-8 encoded).
    """
    hxs = HtmlXPathSelector(response)
    comments = hxs.select('//script[contains(text(),"t_post")]')
    #filename = response.url.split("/")[-2]
    self.file.write('comments: ' + str(len(comments)) + '\n\n')
    #items = []
    for comment in comments:
        # Pull the t_post(...) arguments: quoted strings or bare numbers.
        #pattern = re.compile(r"'?([^(,]+)'?,")
        pattern = re.compile(r"('(.*?)'|(\d+),)", re.S)
        results = pattern.findall(comment.extract())
        # Prefer the numeric capture group, else the quoted-string group.
        comment_items = list((x[2] if x[2] else x[1]) for x in results)
        item = IxbtItem()
        # Argument 5 is the semicolon-separated "thanks" list; its length
        # approximates the number of people who thanked the post.
        if len(comment_items) > 5:
            text = comment_items[5]
            item['grats'] = len(text.split(';'))
        else:
            item['grats'] = 0
        item['text'] = []
        text = ''
        # Argument 4 is the post body HTML; normalise it to plain text
        # and split it into individual jokes on blank lines.
        if len(comment_items) > 4:
            text = comment_items[4];
            text = re.sub(r'<br>', '\n', text)
            text = re.sub(r'<p>.*<p>', '\n', text)
            text = re.sub(r'\\n', '\n', text)
            #text = re.sub(r'\<.*', '', text)
            #text = re.sub(r'\<[^>]*\>', '', text)
            # Drop very short lines (signatures, greetings, etc.).
            text = re.sub(r'(\n|^).{1,20}(\n)+', '\n', text)
            #text = re.sub(r'(\n){3,}', '\n\n', text)
            #text = re.sub(r'\s+$', '', text)
            #text = re.sub(r'^\s+', '', text)
            pattern = re.compile(r'(.+?)(\n\n|$)', re.S)
            tuples = pattern.findall(text)
            # Keep only fragments longer than 12 chars as jokes.
            item['text'] = list(x[0].strip() for x in tuples if len(x[0].strip()) > 12)
        # Argument 1 is the author; argument 0 is the post anchor id.
        item['author'] = comment_items[1]
        item['url'] = response.url + u'#' + comment_items[0]
        if item['grats'] > 2:
            self.file.write('Автор: ' + item['author'].encode('UTF-8') + '\n')
            self.file.write(str(item['grats']) + ' человек сказали спасибо\n')
            self.file.write(item['url'] + '\n')
            # NOTE(review): `s` is computed but never used.
            s = '\n'.join(item['text'])
            self.file.write('кол-во анекдотов: ' + str(len(item['text'])) + '\n')
            #self.file.write(comment_items[4].encode('UTF-8'))
            for joke in item['text']:
                self.file.write(joke.encode('UTF-8') + '\n\n')
        #items.append(item)
        yield item
    # Pagination: the next-page href lives inside a t_assign(...) script.
    next_url = hxs.select('//script[contains(text(),"t_assign")]').re(u'href=([^ ]*?)>далее')
    if len(next_url) > 0:
        next_url = next_url[0]
        # NOTE(review): `parsed_url` is computed but never used.
        parsed_url = urlparse(next_url)
        next_url = urljoin(response.url, next_url)
        yield Request(next_url, callback=self.parse)
        self.file.write("Следующая страница: " + next_url.encode('UTF-8') + '\n')
def parse_categories(self, response):
    """Queue a product-listing request for every category link on the page."""
    selector = HtmlXPathSelector(response)
    for category_href in selector.select('//*[@id="categorylist"]/ul[@class="categories"]/li/h2/a/@href').extract():
        yield Request(category_href, callback=self.parse_products)
def parse_one_supporters_page(self, response):
    """Parse one page of project backers and yield a Proj_Supporter item
    per backer.

    The project id is taken from the first digit run in ``response.url``;
    each supporter field is only set when exactly one node matched, so
    partially-filled items are possible.
    """
    hxs = HtmlXPathSelector(response)
    # titles = hxs.select("//span[@class='pl']") # avoid double parse here???
    # First run of digits in the URL is the project id; -1 = not found.
    backer_url = re.search('[0-9]+', response.url)
    PROJ_ID = -1
    if backer_url != None:
        # self.log('parse the proj_id in backer page error in %s' %response.url)
        #else:
        PROJ_ID = backer_url.group(0)
    backers = hxs.select(
        "//div[@class='projects-backers-left']/div[@class='supporters']")
    items = []
    for backer in backers:
        item = Proj_Supporter()
        supporter_name = backer.select(
            ".//div[@class='supportersmeta']/div[@class='supportersmeta-t']/a[@class='supportersmeta-t-a']/text()"
        ).extract()
        # The supporter id is embedded in the profile href; cleaned below.
        supporter_id = backer.select(
            ".//div[@class='supportersmeta']/div[@class='supportersmeta-t']/a[@class='supportersmeta-t-a']/@href"
        ).extract()
        supporter_icon = backer.select(
            ".//div[@class='supportersmeta']/div[@class='supportersmeta-t']/div[@class='icon-sun-ms']/a/text()"
        ).extract()
        # Positional text() nodes of the meta block — assumes layout
        # [2]=time, [3]=amount, [4]=total projects; TODO confirm markup.
        supporter_total_support_proj = backer.select(
            ".//div[@class='supportersmeta']/text()[4]").extract()
        supporter_support_time = backer.select(
            ".//div[@class='supportersmeta']/text()[2]").extract()
        supporter_support_amount = backer.select(
            ".//div[@class='supportersmeta']/text()[3]").extract()
        #print "supporter name", supporter_name
        #print "supporter url", supporter_url
        #print "supporter icon level ", supporter_icon
        #print "supporter_total_support_proj ", supporter_total_support_proj
        #print "supporter_support_time ", supporter_support_time
        #print "supporter total support", supporter_support_amount
        # Each field is set only when the selector matched exactly once.
        if len(supporter_name) == 1:
            item['supporter_name'] = supporter_name[0]
        if len(supporter_id) == 1:
            item['supporter_id'] = item.clean_supporter_id(supporter_id[0])
        if len(supporter_icon) == 1:
            item['supporter_icon'] = item.clean_supporter_icon(
                supporter_icon[0])
        if len(supporter_support_time) == 1:
            item[
                'supporter_support_time'] = item.clean_supporter_support_time(
                    supporter_support_time[0])
        if len(supporter_support_amount) == 1:
            item['supporter_support_amount'] = supporter_support_amount[0]
        if len(supporter_total_support_proj) == 1:
            item[
                'supporter_total_support_proj'] = item.clean_supporter_total_support_proj(
                    supporter_total_support_proj[0])
        item['supporter_proj_id'] = PROJ_ID
        items.append(item)
    for item in items:
        yield item  # return items """
def parse_proj_info(self, response):
    """Parse a demohour.com project page and yield several item types:

    - one ``Proj_Item`` (core project table row),
    - one ``Proj_Owner_Item`` (owner table row),
    - supporter items forwarded from ``parse_backers_links``,
    - one ``Proj_Topic`` (topic/buzz counters),
    - one ``Proj_Incentive_Options_Item`` per reward tier,

    plus a ``Request`` for the backers listing page.  Most fields are
    set only when the corresponding selector matched the expected number
    of nodes; failures are logged (and sometimes printed) but do not
    abort the whole parse except where noted.
    """
    hxs = HtmlXPathSelector(response)
    ##################################################################################################################
    # section of proj table
    # (proj_url, proj_id(PK), proj_name, proj_funding_target, proj_current_funding_amount, proj_current_funding_percentage, proj_status, proj_left_over_time, proj_owner_name,
    # proj_owner_location, proj_supporter_count, proj_surfer_count, proj_topic_count)
    ###################################################################################################################
    proj = Proj_Item()
    # get proj url, add prefix to get the complete url
    proj_url = hxs.select(
        "//div[@class='ui-tab']/div[@class='ui-tab-top']/h1/a/@href"
    ).extract()
    if len(proj_url) != 1:
        # Without the project URL nothing else can be keyed — bail out.
        self.log("Parse the proj url error. %s" % response.url)
        return
    else:
        proj['proj_url'] = self.add_url_prefix(proj_url[0])
    # one very important id -->Proj_Id
    # if len(
    # The relative href is expected to look like "/projects/<id>",
    # i.e. three '/'-separated pieces; the last piece is the id.
    PROJ_ID = proj_url[0].split('/')
    if len(PROJ_ID) != 3:
        self.log("Parse Proj_id error. %s" % response.url)
    else:
        PROJ_ID = PROJ_ID[len(PROJ_ID) - 1]
        proj['proj_id'] = PROJ_ID
    # get the proj name
    proj_title = hxs.select(
        "//div[@class='ui-tab']/div[@class='ui-tab-top']/h1/a/text()"
    ).extract()
    if len(proj_title) != 1:
        self.log("Parse the proj name error. %s" % response.url)
    else:
        proj['proj_name'] = proj_title[0]
    # The sidebar class encodes the project state; try each known
    # variant until one matches.
    projs_sidebar_funding = hxs.select("//div[@class='sidebar-funding']")
    if len(projs_sidebar_funding) == 0:
        projs_sidebar_funding = hxs.select(
            "//div[@class='sidebar-warming']")
    if len(projs_sidebar_funding) == 0:
        projs_sidebar_funding = hxs.select(
            "//div[@class='sidebar-success']")
    if len(projs_sidebar_funding) == 0:
        projs_sidebar_funding = hxs.select(
            "//div[@class='sidebar-failure']")
    if (len(projs_sidebar_funding) != 1):
        self.log("Parse the proj table error. %s" % response.url)
        print "Parse the proj table error. %s" % response.url
    else:
        # get proj_funding_target
        p = projs_sidebar_funding[0]
        proj_funding_target = p.select(
            ".//div[@class='sidebar-money-raised-num-t']/b/text()"
        ).extract()
        print proj_funding_target
        if len(proj_funding_target) == 1:
            proj['proj_funding_target'] = proj.clean_proj_funding_target(
                proj_funding_target[0])
        # get proj_current_funding_amount
        proj_current_funding_amount = p.select(
            ".//div[@class='sidebar-money-raised-num']/b/text()").extract()
        print proj_current_funding_amount
        if len(proj_current_funding_amount) == 1:
            proj[
                'proj_current_funding_amount'] = proj.clean_proj_current_funding_amount(
                    proj_current_funding_amount[0])
        # get proj_current_funding_percentage
        proj_current_funding_percentage = p.select(
            ".//span[@class='sidebar-percentage-progress-span']/text()"
        ).extract()
        print proj_current_funding_percentage
        if len(proj_current_funding_percentage) != 1:
            self.log(
                "Parse the proj_current_funding_percentage at url = %s"
                % response.url)
        else:
            # First digit run of the percentage text; stored as a
            # fraction (Decimal / 100).
            percentage = re.search('[\d]+',
                                   proj_current_funding_percentage[0])
            if percentage == None:
                self.log(
                    "Parse the proj_current_funding_percentage at url = %s"
                    % response.url)
            else:
                percentage = percentage.group(0)
                proj['proj_current_funding_percentage'] = Decimal(
                    percentage.strip('"')) / 100
        # this is how many people support this proj
        proj_supporter_count = p.select(
            ".//div[@class='sidebar-number-days-l']/b/b/text()").extract()
        print "support num:", proj_supporter_count
        if len(proj_supporter_count) == 1:
            proj['proj_supporter_count'] = proj_supporter_count[0]
        # this is how many people view this proj
        proj_surfer_count = p.select(
            ".//div[@class='sidebar-number-days-m']/b/b/text()").extract()
        print "people view ", proj_surfer_count
        if len(proj_surfer_count) == 1:
            proj['proj_surfer_count'] = proj_surfer_count[0]
        # get topic of the proj
        topic_count = hxs.select(
            "//ul[@class='ui-tab-menu']/li/a/span[@id='posts_count']/text()"
        ).extract()
        if len(topic_count) != 1:
            self.log("Parse topic count error. %s" % response.url)
            print "Parse topic count error. %s" % response.url
        else:
            proj['proj_topic_count'] = topic_count[0]
        # get the proj_status
        proj_status = p.select(
            ".//div[@class='sidebar-number-days-r']/span/text()").extract()
        if len(proj_status) != 1:
            self.log("Parse proj status error. %s" % response.url)
            print "Parse proj status error. %s" % response.url
        else:
            proj['proj_status'] = proj_status[0]
        # get how many days left
        proj_leftover_time = p.select(
            ".//div[@class='sidebar-number-days-r']/b/b/text()").extract()
        print "days left ", proj_leftover_time
        if len(proj_leftover_time) == 1:
            proj['proj_leftover_time'] = proj_leftover_time[0]
        # get the unit of left_over
        proj_leftover_time_units = p.select(
            ".//div[@class='sidebar-number-days-r']/b/text()").extract()
        if len(proj_leftover_time_units) == 1:
            proj['proj_leftover_time_unit'] = 0  # proj complete
        elif len(proj_leftover_time_units) == 2:
            proj['proj_leftover_time_unit'] = proj_leftover_time_units[1]
        else:
            self.log("Can not parse proj left over time at url=%s" %
                     response.url)
            print "Parse proj left over time error. %s" % response.url
    # get proj_owner information
    projs_owner = hxs.select("//div[@class='project-by']")
    if len(projs_owner) != 1:
        self.log("Parse proj owner error. %s" % response.url)
    else:
        p = projs_owner[0]
        proj_owner_owner_name = p.select(
            ".//a[@class='project-by-img-r-author']/text()").extract()
        if len(proj_owner_owner_name) == 1:
            proj['proj_owner_name'] = proj_owner_owner_name[0]
    # get proj_location --> this wil be extracted in another table
    # reason is this information may not be available at back page, only exist in main page
    yield proj
    # end of section of proj table
    ##################################################################################################################
    ##################################################################################################################
    # section of section of proj_owner_table
    # (proj_owner_owner_id(PK), proj_owner_proj_id(PK), proj_owner_owner_name, proj_owner_star_level, proj_owner_last_log_in_time,
    # proj_owner_own_proj_count, proj_owner_support_proj_count )
    ##################################################################################################################
    projs_owner = hxs.select("//div[@class='project-by']")
    if len(projs_owner) != 1:
        self.log("Parse the proj_owner error. %s" % response.url)
        print "Parse the proj_owner error. %s" % response.url
    else:
        p = projs_owner[0]
        proj_owner = Proj_Owner_Item()
        proj_owner_owner_id = p.select(
            ".//a[@class='project-by-img-r-author']/@href").extract()
        print "proj name url: ", proj_owner_owner_id
        if len(proj_owner_owner_id) != 1:
            self.log("Parse proj owner id from page %s error" %
                     response.url)
        else:
            # Owner id = trailing digit run of the profile URL.
            owner_id = re.search('[0-9]+$', proj_owner_owner_id[0])
            if owner_id == None:
                self.log("Extract the proj owner id from url = %s error" %
                         response.url)
            else:
                proj_owner['proj_owner_owner_id'] = owner_id.group(0)
        proj_owner['proj_owner_proj_id'] = PROJ_ID
        proj_owner_owner_name = p.select(
            ".//a[@class='project-by-img-r-author']/text()").extract()
        print "proj name: ", proj_owner_owner_name
        if len(proj_owner_owner_name) == 1:
            proj_owner['proj_owner_owner_name'] = proj_owner_owner_name[0]
        proj_owner_star_level = p.select(
            ".//div[@class='project-by-img-r']/div[@class='icon-sun-m']/a/text()"
        ).extract()
        print "proj proj_owner_star_level: ", proj_owner_star_level
        if len(proj_owner_star_level) == 1:
            proj_owner['proj_owner_star_level'] = proj_owner_star_level[0]
        proj_owner_last_log_in_time = p.select(
            ".//div[@class='project-by-last-time']/text()").extract()
        print "proj last update time,", proj_owner_last_log_in_time
        # NOTE(review): [0] will raise IndexError if the selector matched
        # nothing — unlike the other fields there is no length guard here.
        log_in = re.search('[\d]+/[\d]+/[\d]+',
                           proj_owner_last_log_in_time[0])
        if log_in == None:
            self.log(
                "parse proj owner proj_owner_last_log_in_time error at page %s"
                % response.url)
        else:
            proj_owner['proj_owner_last_log_in_time'] = log_in.group(0)
        proj_by_post_support_list = p.select(
            ".//div[@class='project-by-post']/a[@target='_blank']/span/text()"
        ).extract()
        proj_owner_support_proj_count = 0
        proj_owner_own_proj_count = 0
        if len(proj_by_post_support_list) >= 1:
            proj_owner_support_proj_count = proj_by_post_support_list[0]
            proj_owner[
                'proj_owner_support_proj_count'] = proj_by_post_support_list[
                    0]
        if len(proj_by_post_support_list) >= 2:
            proj_owner_own_proj_count = proj_by_post_support_list[1]
            proj_owner[
                'proj_owner_own_proj_count'] = proj_by_post_support_list[1]
        print "proj owner supports:", proj_owner_support_proj_count
        print "proj owner owns:", proj_owner_own_proj_count
        yield proj_owner
    # end of section of proj_owner_table
    ##################################################################################################################
    ##################################################################################################################
    # section of donation table, we need to follow the link within the donor page (pagination)
    ##########################################################################################
    #u'/projects/318262/backers' #
    # >>> response.url #
    # 'http://www.demohour.com/projects/318262' #
    ##########################################################################################
    backers = hxs.select(
        "//div[@class='ui-tab-layout']/ul[@class='ui-tab-menu']/li/a/@href"
    )
    if len(backers) == 3:
        # we have current tab, posts and backers tab
        backer_relative_urls = backers[2].extract().split('/')
        backer_relative_url = backer_relative_urls[
            len(backer_relative_urls) - 1]
        backers_full_url = response.url + '/' + backer_relative_url
        yield Request(backers_full_url, self.parse_backers_links)
        # NOTE(review): this also calls parse_backers_links on the
        # CURRENT response (the project page), not the backers page the
        # Request above targets — presumably to catch backers inlined on
        # the main page; confirm this is intentional.
        for supporter in self.parse_backers_links(
                response):  # we have supporter information here
            print "supporter name:", supporter['supporter_name']
            print "supporter url:", supporter['supporter_url']
            print "supporter icon:", supporter['supporter_icon']
            print "supporter support time", supporter[
                'supporter_support_time']
            print "supporter support amount", supporter[
                'supporter_support_amount']
            print "supporter support total proj count", supporter[
                'supporter_total_support_proj']
            supporter['supporter_proj_id'] = PROJ_ID
            yield supporter
    # end of section of donation table
    ##################################################################################################################
    # if we want to add the user information table, we will do sth similar to the back table here
    ###################################################################################################################################
    # section of Topic table
    # (topic_proj_id(PK), topic_total_buzz_count, topic_announcement_count, topic_question_count, topic_up_count, topic_down_count, topic_proj_category, topic_proj_location )
    ###################################################################################################################################
    projs_topic = hxs.select("//div[@class='projects-home-left']")
    if len(projs_topic) == 1:
        #self.log("Parse the topic at the end of the page error at url = %s" %response.url)
        #else:
        proj_topic = Proj_Topic()
        proj_topic['topic_proj_id'] = PROJ_ID
        # get the topic_total_buzz_count
        topic_total_buzz_count = projs_topic.select(
            ".//li/a[@id='filter_all']/span/text()").extract()
        if len(topic_total_buzz_count) != 1:
            self.log("Parse topic_total_buzz_count error at url = %s" %
                     response.url)
        else:
            proj_topic['topic_total_buzz_count'] = topic_total_buzz_count[
                0]
        # Positional layout assumed: [1]=announcements, [2]=questions,
        # [3]=up, [4]=down — TODO confirm against page markup.
        topic_all_count = projs_topic.select(
            ".//li/a[@data-remote='true']/span/text()").extract()
        if len(topic_all_count) < 5:
            self.log("Parse other buzz count error at url = %s" %
                     response.url)
        else:
            proj_topic['topic_announcement_count'] = topic_all_count[1]
            proj_topic['topic_question_count'] = topic_all_count[2]
            proj_topic['topic_up_count'] = topic_all_count[3]
            proj_topic['topic_down_count'] = topic_all_count[4]
        # now we will get the proj tags, e.g., category, location
        projs_tag = hxs.select(
            ".//div[@class='projects-home-left-seat']/a[@target='_blank']/text()"
        ).extract()
        if len(projs_tag) != 3:
            # NOTE(review): this `return` skips the reward section below
            # whenever the tags do not parse.
            self.log("Parse proj tag error at url = %s" % response.url)
            return
        else:
            proj_topic['topic_proj_category'] = projs_tag[0]
            proj_topic['topic_proj_owner_name'] = projs_tag[1]
            proj_topic['topic_proj_location'] = projs_tag[2]
        yield proj_topic
    # yield item
    ###################################################################################################################################
    # section of incentive/reward table
    # (incentive_proj_id(PK), incentive_id(PK), incentive_expect_support_amount, incentive_current_supporter_count, incentive_total_allowable_supporter_count,
    # incentive_description, incentive_reward_shipping_method, incentive_reward_shipping_time)
    ###################################################################################################################################
    projs_reward_options = hxs.select("//div[@class='reward-options']/ul")
    rewards = []
    # NOTE(review): firstIncentive is never used.
    firstIncentive = True
    for p in projs_reward_options:
        reward = Proj_Incentive_Options_Item()
        reward['incentive_proj_id'] = PROJ_ID
        # get incentive_expect_support_amount
        incentive_expect_support_amount = p.select(
            ".//li[@class='support-amount']/text()[2]").extract()
        print "support amount: ", incentive_expect_support_amount
        if len(incentive_expect_support_amount) == 1:
            reward[
                'incentive_expect_support_amount'] = reward.clean_expect_support_amount(
                    incentive_expect_support_amount[0])
        # if len(support_amount) == 1:
        #     reward['incentive_expect_support_amount'] = support_amount[0]
        # get incentive_current_supporter_count
        incentive_current_supporter_count = p.select(
            ".//li[@class='support-amount']/span/text()").extract()
        print "supporter number:", incentive_current_supporter_count
        if len(incentive_current_supporter_count) == 1:
            count = reward.clean_current_supporter_count(
                incentive_current_supporter_count[0])
            if len(count) == 1:
                reward['incentive_current_supporter_count'] = count[0]
        # get incentive_total_allowable_supporter_count, if any
        incentive_total_allowable_supporter_count = p.select(
            ".//li[@class='supporter-number']/div[@class='supporter-limit']/p/text()"
        ).extract()
        if len(incentive_total_allowable_supporter_count) == 1:
            quote = reward.clean_total_allowable_supporter_count(
                incentive_total_allowable_supporter_count[0])
            if len(quote) >= 1:
                reward[
                    'incentive_total_allowable_supporter_count'] = quote[0]
        # get incentive_description,
        incentive_description = p.select(
            ".//li[@class='returns-contents']/p/text()").extract()
        if len(incentive_description) >= 1:
            reward[
                'incentive_description'] = reward.clean_incentive_descriptions(
                    incentive_description[0])
        # get incentive_reward_shipping_method, if any
        # One text node => shipping time only; two => method then time.
        incentive_reward_shipping_time_and_method = p.select(
            ".//li[@class='returns-contents-time']/p/text()").extract()
        if len(incentive_reward_shipping_time_and_method) == 1:
            shipping_time = reward.clean_reward_shipping_time(
                incentive_reward_shipping_time_and_method[0])
            if len(shipping_time) >= 1:
                reward['incentive_reward_shipping_time'] = shipping_time[0]
        elif len(incentive_reward_shipping_time_and_method) == 2:
            shipping_method = incentive_reward_shipping_time_and_method[0]
            reward['incentive_reward_shipping_method'] = shipping_method
            # NOTE(review): shadows the module/builtin name `time` locally.
            time = reward.clean_reward_shipping_time(
                incentive_reward_shipping_time_and_method[1])
            if len(time) >= 1:
                reward['incentive_reward_shipping_time'] = time[0]
        rewards.append(reward)
    ###################################################################################################################################
    # end of table incentive/reward
    ###################################################################################################################################
    for reward in rewards:
        yield reward
def parse_product(self, response):
    """Parse a product page and yield one Product per model row.

    Each row of the ``tabModeles`` table becomes an item whose
    identifier is ``"<page id>.<row index>"``; when the page carries no
    model table, a single item is emitted at the base price.

    Idiom fix: the original iterated ``enumerate(models)`` as raw tuples
    and manually rebound ``model = model[1]``; tuple unpacking in the
    ``for`` header is equivalent and clearer (``'{}'.format(i)`` renders
    an int identically to the old ``str(model[0])``).
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    # Numeric page id embedded in the URL, e.g. "...-1234.html".
    identifier = re.search('-([\d\,]+)\.html', response.url).group(1)
    product_name = hxs.select(
        '//div[@itemprop="name"]/text()')[0].extract().strip()
    base_price = hxs.select('//p[@itemprop="Price"]/text()')[0].extract()
    # Decimal part is rendered in a separate span; append when present.
    base_price_decimal = hxs.select(
        '//p[@itemprop="Price"]/span[@class="decimal"]/text()').extract()
    if base_price_decimal:
        base_price += base_price_decimal[0]
    image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
    category = hxs.select('//div[@id="filCateg"]/a/text()').extract()
    brand = hxs.select(
        '//span[@itemprop="brand"]/text()')[0].extract().strip()
    out_of_stock = hxs.select('//img[@id="BoutonIndispo"]')
    models = hxs.select('//table[@class="tabModeles"]/tr[@class="tr_FA"]')
    for i, model in enumerate(models):
        model_name = model.select(
            './/td[@class="ref"]/text()')[0].extract().strip()
        model_price = model.select(
            './/td[@class="prix"]/span[@class="Normal" or @class="NormalSansCoupon" or @class="Promo" or @class="TopPrix"]/text()'
        )[0].extract()
        model_price_decimal = model.select(
            './/td[@class="prix"]/span[@class="Normal" or @class="NormalSansCoupon" or @class="Promo" or @class="TopPrix"]/span[@class="decimal"]/text()'
        ).extract()
        if model_price_decimal:
            model_price += model_price_decimal[0]
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', '{}.{}'.format(identifier, i))
        loader.add_value('sku', model_name)
        loader.add_value('url', response.url)
        loader.add_value('name', u'{} {}'.format(product_name, model_name))
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', brand)
        loader.add_value('category', category[-1] if category else '')
        loader.add_value('price', model_price)
        if out_of_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
    if not models:
        # No per-model table: one item at the base price.
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier)
        loader.add_value('url', response.url)
        loader.add_value('name', product_name)
        if image_url:
            loader.add_value('image_url',
                             urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', brand)
        loader.add_value('category', category[-1] if category else '')
        loader.add_value('price', base_price)
        if out_of_stock:
            loader.add_value('stock', 0)
        yield loader.load_item()
def parse_product(self, response):
    """Parse a wms.co.uk product page and yield Product items.

    Handles three cases per product block: a variant ``<select>`` (one
    item per option, availability/price scraped from the inline
    SwapVariant script), a plain block (single item), and "You may need"
    accessory rows.  Orders up to 100 get a 5.33 shipping cost.
    """
    # NOTE(review): this replace() swaps a string for itself — a no-op.
    # The original was presumably escaping a bare '<' (e.g. '&lt;3kg')
    # that breaks the HTML parser on this one URL; the intended
    # replacement text appears to have been lost. Confirm and restore.
    if response.url == 'http://www.wms.co.uk/Pulse_Oximetry/Handheld_Pulse_Oximeters/Huntleigh_Smartsigns_MiniPulse_MP1R_Rechargeable_Pulse_Oximeter?PC=W6609':
        text = response.body.replace('<3kg', '<3kg')
        hxs = HtmlXPathSelector(text=text)
    else:
        hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//*[@id="Images_Main"]/@src').extract()
    image_url = urljoin(base_url, image_url[0]) if image_url else ''
    category = hxs.select(
        '//*[@id="Breadcrumb_Div"]/div/a[2]/text()').extract()
    category = category[0] if category else ''
    products = hxs.select(
        '//*[@id="Product_Div_Outer"]//div[@class="Product_Grid_Outer"]')
    for product in products:
        try:
            price = product.select(
                './/div[@class="Product_Grid_Price"]/text()').extract(
                )[0].strip()
            price = extract_price(price)
        except Exception as e:
            # Price missing/unparseable: skip this block entirely.
            self.log("Couldn't find price for product {}, error code: {}".
                     format(response.url, e))
            continue
        availability = product.select(
            './/div[@class="Product_Grid_Availability"]/text()').extract()
        if availability and availability[0].strip(
        ) == 'This product is no longer available':
            self.log('Product {} is no longer available'.format(
                response.url))
            continue
        options = product.select(
            './/select[@class="Product_Grid_Variant_Select"]/option')
        if options:
            # Variant data lives in the SwapVariant JS; split into lines
            # so per-option availability/price can be grepped out.
            x = hxs.select(
                '//script[contains(text(), "SwapVariant(event, intPC)")]'
            ).extract()[0]
            options_availability_lines = x.split('\r\n')
            name = product.select(
                './div[@class="Product_Grid_Description"]/text()').extract(
                )[0].strip()
            for option in options:
                loader = ProductLoader(item=Product(), selector=product)
                identifier = option.select('./@value').extract()[0]
                option_name = option.select(
                    './text()').extract()[0].strip()
                option_availability = ''
                for line in options_availability_lines:
                    if identifier in line:
                        if any(word in line for word in [
                                'strInner = ""',
                                'Please contact us for availability',
                                'None in stock'
                        ]):
                            option_availability = 'out of stock'
                        if 'This product is no longer available' in line:
                            option_availability = 'delisted'
                        # self.log("==============={}================".format(line))
                if option_availability == 'delisted':
                    self.log('Product {} is delisted'.format(response.url))
                    continue
                elif option_availability == 'out of stock':
                    loader.add_value('stock', 0)
                loader.add_value('url', response.url)
                loader.add_value('name', name + ' ' + option_name)
                loader.add_value('image_url', image_url)
                loader.add_value('category', category)
                # Per-option price is on the script line mentioning both
                # the option id and "PRICE"; fall back to block price.
                price_line = ''
                for line in response.body_as_unicode().split('\n'):
                    if identifier.upper() in line.upper(
                    ) and 'PRICE' in line.upper():
                        price_line = line
                option_price = re.findall("strPrice = (.*)<br", price_line)
                option_price = extract_price(
                    option_price[0]) if option_price else 0
                option_price = option_price if option_price else price
                loader.add_value('price', option_price)
                loader.add_value('sku', identifier)
                loader.add_value('identifier', identifier)
                if int(price) <= 100:
                    loader.add_value('shipping_cost', 5.33)
                yield loader.load_item()
        else:
            # No variants: single item for this product block.
            loader = ProductLoader(item=Product(), selector=product)
            availability = product.select(
                './/div[@class="Product_Grid_Availability"]/span/text()'
            ).extract()
            if availability:
                availability = availability[0].strip()
                if 'None in stock' in availability or 'Please contact us for availability' in availability:
                    loader.add_value('stock', 0)
            loader.add_value('url', response.url)
            name = product.select(
                './div[@class="Product_Grid_Description"]/text()').extract(
                )
            if not name:
                name = hxs.select("//h1/text()").extract()
            name = name.pop().strip()
            sku = product.select(
                './div[@class="Product_Grid_Code_Availability_Outer"]//strong/text()'
            ).extract()
            if not sku:
                sku = product.select(
                    './/div[@class="Product_Grid_Code_Availability_Outer"]//text()'
                ).re(".* (.*)")
            sku = sku.pop().strip()
            loader.add_value('name', name)
            loader.add_value('image_url', image_url)
            loader.add_value('category', category)
            loader.add_value('price', price)
            loader.add_value('sku', sku.strip())
            identifier = product.select(
                './/input[contains(@name,"PC")]/@value').extract()[0]
            # Combine sku and form PC value when they differ so the
            # identifier stays unique.
            if sku.strip() != identifier.strip():
                loader.add_value('identifier',
                                 identifier.strip() + '-' + sku.strip())
            else:
                loader.add_value('identifier', identifier.strip())
            if int(price) <= 100:
                loader.add_value('shipping_cost', 5.33)
            yield loader.load_item()
    # Accessory rows ("You may need") become plain items keyed by sku.
    other_products = hxs.select(
        '//div[@id="You_May_Need"]/div[@class="Product_Page_Accessories_Row"]'
    )
    for product in other_products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_value('url', response.url)
        name = product.select(
            'div[contains(@class, "_Description")]/text()').extract()
        if not name:
            self.log('Name not found for product {}'.format(response.url))
            continue
        loader.add_value('name', name[0])
        loader.add_value('category', category)
        price = product.select(
            'div[contains(@class, "_Price")]/strong/font/text()').extract(
            )[0].strip()
        price = extract_price(price)
        loader.add_value('price', price)
        sku = product.select(
            'div[contains(@class, "_Code")]/div/strong/text()').extract(
            )[0].strip()
        loader.add_value('sku', sku)
        loader.add_value('identifier', sku)
        if int(price) <= 100:
            loader.add_value('shipping_cost', 5.33)
        yield loader.load_item()
def parse_person_detail(self, response):
    """Fill long_description and presence on the Person item.

    Reuses the item passed in via response.meta['person'] when present,
    otherwise starts a fresh one.
    """
    hxs = HtmlXPathSelector(response)
    person = response.meta.get('person') or Person()
    paragraphs = hxs.select('//h1[@class="h1_first"]/following-sibling::p/text()').extract()
    person['long_description'] = "\n".join(paragraphs)
    presence_parts = hxs.select('//h2[text()[normalize-space(.)="Web Presences"]]/following-sibling::div[position()=1]//text()').extract()
    person['presence'] = ','.join(presence_parts).strip()
    yield person
def parse(self, response):
    """Queue a company-list request for every alphabet-navigation link."""
    selector = HtmlXPathSelector(response)
    nav_links = selector.select('//ul[@class="col1_alpha_nav"]/li/a/@href').extract()
    for link in nav_links:
        yield Request(self.url_base + link, callback=self.parse_company_list)
def parse_product(self, response):
    """Parse a Magento-style product page and yield Product items.

    Emits the base product, then — when the page carries a
    ``spConfig`` options block — one item per configurable option,
    optionally multiplied by custom-option combinations.

    Bug fix: when the page has more than one custom-option ``<select>``,
    the original inner loop iterated ``options_containers.select(...)``
    (ALL selects) on every pass instead of the current container, so
    every combination slot was built from the concatenation of all
    selects' options.  It now iterates the current container only.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    loader = ProductLoader(item=Product(), response=response)
    brand = hxs.select('//span[@itemprop="brand"]/span/text()').extract()
    brand = brand[0] if brand else ''
    product_name = hxs.select('//h1[@itemprop="name"]/text()').extract()
    product_name = product_name[0].strip()
    product_price = response.xpath(
        '//meta[@itemprop="price"]/@content').extract_first()
    product_price = extract_price(product_price)
    product_code = hxs.select(
        '//div[@class="product-name"]/meta[@itemprop="sku"]/@content'
    ).extract()[0]
    image_url = hxs.select(
        '//div[@class="product-img-box"]/div/a/img/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//div[@id="imageShowcase"]/img/@src').extract()
    image_url = image_url[0] if image_url else ''
    categories = hxs.select(
        '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('name', product_name)
    loader.add_value('url', response.url)
    loader.add_value('sku', product_code)
    loader.add_value('identifier', product_code)
    loader.add_value('brand', brand)
    loader.add_value('shipping_cost', '4.99')
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url))
    # Keep all breadcrumb levels except the generic "Brands" entry.
    for category in categories:
        if category.upper() != 'BRANDS':
            loader.add_value('category', category)
    loader.add_value('price', product_price)
    out_of_stock = hxs.select('//p[@class="availability out-of-stock"]')
    if out_of_stock:
        loader.add_value('stock', 0)
    item = loader.load_item()
    # Configurable-product data is embedded as JSON in a script tag.
    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        option_item = deepcopy(item)
        product_data = json.loads(options_config.groups()[0])
        # products: option-product id -> concatenated option labels.
        # prices: option-product id -> accumulated price delta.
        products = {}
        prices = {}
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
                    prices[product] = prices.get(
                        product, 0) + extract_price(option['price'])
        options_containers = hxs.select(
            '//select[contains(@name, "options[")]')
        extra_options = []
        if len(options_containers) > 1:
            # Several custom-option selects: build the cartesian product
            # of their options.
            combined_options = []
            for options_container in options_containers:
                element_options = []
                # FIX: iterate the current container's options only
                # (was options_containers.select(...), which pulled in
                # every select's options for each slot).
                for option in options_container.select(
                        'option[@value!=""]'):
                    option_id = option.select('@value').extract()[0]
                    option_name = option.select('text()').extract()[0]
                    option_price = option.select('@price').extract()[0]
                    option_attr = (option_id, option_name, option_price)
                    element_options.append(option_attr)
                combined_options.append(element_options)
            combined_options = list(
                itertools.product(*combined_options))
            for combined_option in combined_options:
                final_option = {}
                for option in combined_option:
                    final_option['desc'] = final_option.get(
                        'desc', '') + ' ' + option[1]
                    final_option['identifier'] = final_option.get(
                        'identifier', '') + '-' + option[0]
                    final_option['price'] = final_option.get(
                        'price', 0) + extract_price(option[2])
                extra_options.append(final_option)
        else:
            # Zero or one select: flat list of options.
            for option in options_containers.select('option[@value!=""]'):
                final_option = {}
                final_option['desc'] = ' ' + option.select(
                    'text()').extract()[0]
                final_option['identifier'] = '-' + option.select(
                    '@value').extract()[0]
                final_option['price'] = extract_price(
                    option.select('@price').extract()[0])
                extra_options.append(final_option)
        product_price = extract_price(product_data['basePrice'])
        for option_identifier, option_name in products.iteritems():
            option_item[
                'identifier'] = product_code + '-' + option_identifier
            option_item['name'] = product_name + option_name
            option_item[
                'price'] = product_price + prices[option_identifier]
            option_item['sku'] = option_item['identifier']
            if extra_options:
                for extra_option in extra_options:
                    extra_opt_item = deepcopy(option_item)
                    extra_opt_item['identifier'] = extra_opt_item[
                        'identifier'] + extra_option['identifier']
                    extra_opt_item['name'] = extra_opt_item[
                        'name'] + extra_option['desc']
                    extra_opt_item['price'] = extra_opt_item[
                        'price'] + extra_option['price']
                    # NOTE(review): sku deliberately kept as the BASE
                    # option identifier (without the extra-option
                    # suffix), matching the original; confirm that is
                    # the intended sku for combined variants.
                    extra_opt_item['sku'] = option_item['identifier']
                    yield extra_opt_item
            else:
                yield option_item
    else:
        yield item
def parse(self, response):
    """Schedule a crawl of every top-level category linked from the menu."""
    sel = HtmlXPathSelector(response)
    menu_xpath = '//ul[@id="menu"]//li[@class="level1"]/a/@href'
    for category_url in sel.select(menu_xpath).extract():
        yield Request(category_url, callback=self.parse_category)
def parse(self, response):
    """Follow each link in the #catNav list into the category parser."""
    selector = HtmlXPathSelector(response)
    for href in selector.select('//*[@id="catNav"]/li/a/@href').extract():
        yield Request(href, callback=self.parse_categories)
def parse_product(self, response):
    """Parse a product page, resubmitting the colour drop-down once per colour.

    First pass (no colour selected yet): POST the page form back once per
    colour option so each colour renders its own price, then bail out.
    Second pass (a colour is selected, or there are no colours): build the
    Product item; when both a ProdId and an image id are found, detour via
    the zoom-image page before yielding.
    """
    hxs = HtmlXPathSelector(response)
    category = hxs.select('//div[@id="bCrumb"]/span/a/text()').extract()
    # Fall back to the category carried through request meta on re-posts.
    category = category[-1] if category else response.meta.get('category', '')
    # value "0" is the "please select" placeholder option.
    colours = hxs.select(
        '//select[@id="cphMain_ddlColour"]/option[@value!="0"]/@value').extract()
    no_option_selected = hxs.select(
        '//select[@id="cphMain_ddlColour"]/option[@value="0" and @selected]/@value')
    if colours and no_option_selected:
        # ASP.NET postback: replay every hidden input, overriding the colour.
        for colour in colours:
            formdata = {}
            inputs = hxs.select('//form[@id="frmMain"]//input')
            for input in inputs:
                name = ''.join(input.select('@name').extract())
                value = ''.join(input.select('@value').extract())
                formdata[name] = value
            formdata['ctl00$cphMain$ddlColour'] = colour
            form_url = hxs.select('//form[@id="frmMain"]/@action').extract()[0]
            yield FormRequest(form_url,
                              dont_filter=True,
                              method='POST',
                              formdata=formdata,
                              callback=self.parse_product,
                              meta={'category': category})
        return
    loader = ProductLoader(item=Product(), selector=hxs)
    identifier = hxs.select('//div[@class="code"]/text()').extract()[0]
    loader.add_xpath('sku', '//div[@class="code"]/text()')
    loader.add_value('url', response.url)
    product_name = hxs.select('//div[@class="title"]//h1/text()').extract()[0]
    loader.add_value('category', category)
    img = hxs.select('//img[@id="cphMain_imgThumb"]/@src').extract()
    if img:
        loader.add_value('image_url',
                         urljoin_rfc(get_base_url(response), img[0]))
    loader.add_xpath('brand', '//span[@class="brand"]/text()')
    loader.add_value('stock', '1')
    # NOTE(review): price has not been added to the loader yet at this
    # point, so get_output_value('price') presumably evaluates against an
    # empty/default value — confirm the intended shipping threshold works.
    if loader.get_output_value('price') < 50.00:
        loader.add_value('shipping_cost', '4.95')
    else:
        loader.add_value('shipping_cost', '0')
    price = hxs.select('//span[@class="price"]/text()').extract()
    if colours:
        # Option text looks like "Red - £12.34"; split name from price.
        colour = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@selected]/text()'
        ).extract()[0]
        colour_id = hxs.select(
            '//select[@id="cphMain_ddlColour"]/option[@selected]/@value'
        ).extract()[0]
        loader.add_value('identifier', identifier + '-' + colour_id)
        loader.add_value(
            'name',
            product_name + ' - ' + colour.split(u' - \xa3')[0].strip())
        option_price = re.search(r"\xa3(\d+.\d+)", colour)
        if option_price:
            loader.add_value('price', option_price.group(1))
        else:
            loader.add_value('price', price)
        colour = colour.split(u' - \xa3')[0].strip()
    else:
        colour = hxs.select(
            '//span[@id="cphMain_lblSelectedColour"]/b/text()').extract()
        if colour:
            product_name = product_name + ' - ' + colour[0].strip()
        colour = ''.join(colour)
        loader.add_value('identifier', identifier)
        loader.add_value('name', product_name)
        loader.add_value('price', price)
    # Image ids are matched by the colour name embedded in the img alt text.
    image_id = hxs.select('//img[@alt="' + colour.strip().upper() +
                          '"]/@src').re(r'Products/(\d+)-')
    if not image_id:
        image_id = hxs.select('//img[@alt="' + colour.strip() +
                              '"]/@src').re(r'Products/(\d+)-')
    prod_id = re.search(r'ProdId=(.*)&', response.url)
    if prod_id and image_id:
        image_id = image_id[0]
        prod_id = prod_id.group(1)
        product = loader.load_item()
        image_page = 'http://www.gooutdoors.co.uk/ZoomProductImages.aspx?ProductId=%s&ProductImageId=%s' % (
            prod_id, image_id)
        # Fetch the zoom page to resolve the full-size image URL.
        yield Request(image_page,
                      callback=self.parse_image,
                      meta={'product': product})
    else:
        yield loader.load_item()
def parse_products(self, response):
    """Scrape one Amazon product detail page into a single AmazonItem.

    Most fields keep the raw list returned by ``extract()``; ``COD``,
    ``category`` and ``subcategory`` are hard-coded for this crawl.
    Returns a one-element list of items.
    """
    hxs = HtmlXPathSelector(response)
    items = []
    item = AmazonItem()
    item['title'] = hxs.select(
        '//div[@class="a-section a-spacing-none"]/h1/span[@id="productTitle"]/text()'
    ).extract()
    item['brand'] = hxs.select('//a[@id="brand"]/text()').extract()
    item['specs'] = hxs.select(
        '//div[@class="pdTab"][1]//node()').extract()
    item['offerprice'] = hxs.select(
        '//span[@id="priceblock_ourprice"]/text()').extract()
    item['saleprice'] = hxs.select(
        '//span[@id="priceblock_saleprice"]/text()').extract()
    item['description'] = hxs.select(
        '//div[@id="productDescription"]//text()').extract()
    item['feature'] = hxs.select(
        '//ul[@class="a-vertical a-spacing-none"]/li/span/text()').extract()
    item['image'] = hxs.select(
        '//span[@class="a-button-text"]/img/@src').extract()
    # URL is carried through request meta by the caller.
    item['link'] = response.meta["url"]
    item['seller'] = hxs.select(
        '//div[@id="merchant-info"]/a[1]/text()').extract()
    item['sellrating'] = hxs.select(
        '//div[@id="merchant-info"]/text()').extract()
    # Fix: the original indexed extract()[0] unconditionally, which raises
    # IndexError on pages without a star-rating element; default to ''.
    starating = hxs.select(
        '//a[@class="a-link-normal"]/i/span/text()').extract()
    item['starating'] = starating[0] if starating else ''
    item['COD'] = "Available"
    item['category'] = "Mobiles & Tablets"
    item['subcategory'] = "Wearable Devices"
    items.append(item)
    return items
def parse_product(self, response):
    """Parse a product page, fanning out one form POST per item-list option.

    On the first visit (no ``requested`` flag in meta) every option of the
    item-list select is re-posted through the ASP.NET form so each variant
    gets its own response; each re-post comes back into this same callback
    with ``requested`` set, where the item is actually built and yielded.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    name = hxs.select(
        u'//div[@class="datac2"]//h1[@class="mpv_desc"]/text()').extract(
        )[0].strip()
    multiple_options = hxs.select(
        u'//select[@class="mpv_itemalst"]//option')
    if multiple_options and not u'requested' in response.meta:
        # Fan out: one ASP.NET postback per option; dont_click avoids
        # triggering the form's submit button.
        for option in multiple_options:
            formname = u'aspNetForm'
            formdata = {
                u'ctl00$MainContent$ItemAList':
                option.select(u'./@value').extract()[0],
                u'__EVENTTARGET': u'ctl00$MainContent$ItemAList',
                u'__EVENTARGUMENT': u''
            }
            req = FormRequest.from_response(response,
                                            formname=formname,
                                            formdata=formdata,
                                            meta={u'requested': True},
                                            dont_click=True,
                                            callback=self.parse_product)
            yield req
    if multiple_options:
        # Append the currently-selected option's label to the product name.
        name += u' %s' % multiple_options.select(
            u'../option[@selected]/text()').extract()[0].strip()
    loader = ProductLoader(item=Product(), response=response)
    product_id = hxs.select(
        '//*[@id="ctl00_MainContent_lblLinecode"]/text()').re(r'(\d+)')
    if product_id:
        loader.add_value('identifier', product_id[0])
    else:
        self.log('ERROR: Identifier not found!')
    product_sku = hxs.select(
        '//*[@id="ctl00_MainContent_lblProductCode"]/text()').re(r'(\d+)')
    if product_sku:
        loader.add_value('sku', product_sku[0])
    else:
        self.log('ERROR: SKU not found!')
    product_image = hxs.select('//*[@id="zoom1"]/@href').extract()
    if product_image:
        url = urljoin_rfc(get_base_url(response), product_image[0])
        loader.add_value('image_url', url)
    product_category = hxs.select(
        '//*[@id="papertrail"]/ul/li[1]/a/text()').extract()
    if product_category:
        loader.add_value('category', product_category[0])
    loader.add_value('url', response.url)
    loader.add_value('name', name)
    # Prefer the offer price; fall back to the regular price label.
    loader.add_xpath(
        'price',
        u'//div[@class="datac2"]//span[@class="offerprc"]/text()')
    if not loader.get_output_value('price'):
        loader.add_xpath('price', u'//span[@class="mpv_prc"]/text()')
    # Only yield items that actually resolved a price.
    if loader.get_output_value('price'):
        yield loader.load_item()
def parse_product(self, response):
    """Parse a product page, recursing into sub-items and option pages.

    Flow: (1) if the page is a listing of sub-items, schedule each and
    stop; (2) if it has option links not yet followed, schedule each with
    ``option`` set and stop; (3) otherwise scrape the base product, then
    either yield it directly or expand the multi-product offers section,
    de-duplicating identifiers via ``self.id_seen`` throughout.
    """
    hxs = HtmlXPathSelector(response)
    sub_items = hxs.select(
        '//div[@class="item-details"]//h3/a/@href').extract()
    if sub_items:
        for sub_item in sub_items:
            url = urljoin(response.url, sub_item)
            yield Request(url, callback=self.parse_product)
        return
    option_links = hxs.select(
        '//form[@id="save-product-to-cart"]//div/ul[contains(@class, "selection-grid")]/li/a/@href'
    ).extract()
    # Only follow option links once; re-entry carries option=True in meta.
    if not response.meta.get('option', False) and option_links:
        for link in option_links:
            url = urljoin(response.url, link)
            yield Request(url,
                          meta={'option': True},
                          dont_filter=True,
                          callback=self.parse_product)
        return
    loader = ProductLoader(item=Product(), response=response)
    loader.add_value('url', response.url)
    #== Extracting Identifier and SKU ==#
    tmp = hxs.select('//div[@id="prod-product-code"]/p/text()').extract()
    if not tmp:
        tmp = hxs.select(
            '//div[@id="bundle-product-code"]/p/text()').extract()
    if tmp:
        loader.add_value('identifier', tmp[0])
        loader.add_value('sku', tmp[0])
    #== Extracting Product Name ==#
    # Cascade of name locations; each fallback covers a page template.
    try:
        name = hxs.select(
            '//h1[@id="prod-title"]/span/text()').extract()[0].strip()
    except:
        try:
            name = hxs.select(
                "//div[@class='mod mod-product-info']/h2/text()").extract(
                )[0].strip()
        except:
            name = hxs.select('//h1[@id="prod-title"]/text()').extract()
            if name:
                name = name[0].strip()
            else:
                name = hxs.select(
                    '//h1/span[@itemprop="name"]/text()').extract()
                if name:
                    name = name[0].strip()
                else:
                    log.msg('### No name at ' + response.url,
                            level=log.INFO)
    tmp = hxs.select('//div[@class="detail-pair"]/p/text()').extract()
    if tmp:
        name += ', ' + tmp[0]
    loader.add_value('name', name)
    #== Extracting Price, Stock & Shipping cost ==#
    price = 0
    tmp = hxs.select(
        '//div[@class="basket-fields"]/meta[@itemprop="price"]/@content'
    ).extract()
    if not tmp:
        tmp = hxs.select(
            '//section[div[@id="prod-product-code"]]//div[@id="prod-price"]/p//strong//text()'
        ).extract()
    if not tmp:
        tmp = hxs.select(
            '//div[@id="prod-price"]//span[@itemprop="price"]/text()'
        ).extract()
    if not tmp:
        tmp = hxs.select('//strong[@class="price"]/text()').extract()
    if tmp:
        # Strip thousands separators before price extraction.
        price = extract_price(''.join(tmp).strip().replace(',', ''))
    loader.add_value('price', price)
    try:
        loader.add_xpath('stock', '//div[@data-jl-stock]/@data-jl-stock')
    except ValueError:
        loader.add_value('stock', '0')
    #== Extracting Image URL ==#
    tmp = hxs.select('//li[contains(@class,"image")]//img/@src').extract()
    if tmp:
        url = urljoin(response.url, tmp[0])
        loader.add_value('image_url', url)
    #== Extracting Brand ==#
    tmp = hxs.select('//div[@itemprop="brand"]/span/text()').extract()
    if tmp:
        loader.add_value('brand', tmp[0].strip())
    #== Extracting Category ==#
    tmp = hxs.select('//div[@id="breadcrumbs"]/ol/li/a/text()').extract()
    if len(tmp) > 1:
        loader.add_value('category', ' > '.join(tmp[-3:]))
    product = loader.load_item()
    #== Extracting Options ==#
    options = hxs.select(
        '//div[@id="prod-multi-product-types"]//div[@itemprop="offers"]')
    if not options:
        # Single-product page: emit it once, guarding against duplicates.
        if not product.get('identifier', None):
            log.msg('### No product ID at ' + response.url,
                    level=log.INFO)
        else:
            if not product['identifier'] in self.id_seen:
                self.id_seen.append(product['identifier'])
                yield product
            else:
                log.msg('### Duplicate product ID at ' + response.url,
                        level=log.INFO)
        return
    #== Process options ==#
    # Each offer becomes a copy of the base product with its own
    # identifier, name suffix, price and stock flag.
    for sel in options:
        item = copy.deepcopy(product)
        tmp = sel.select(
            './/div[contains(@class,"mod-product-code")]/p/text()'
        ).extract()
        if tmp:
            item['identifier'] = tmp[0]
            item['sku'] = tmp[0]
        tmp = sel.select('.//h3/text()').extract()
        if tmp:
            item['name'] = name + ' - ' + tmp[0]
        price = 0
        tmp = sel.select('.//p[@class="price"]/strong/text()').re(
            '[0-9,.]+')
        if not tmp:
            tmp = sel.select('.//strong[@class="price"]/text()').re(
                '[0-9,.]+')
        if tmp:
            price = extract_price(tmp[0].strip().replace(',', ''))
        item['price'] = price
        tmp = sel.select(
            './/link[@itemprop="availability"]/@content').extract()
        if tmp and 'in' in tmp[0].lower():
            item['stock'] = 1
        else:
            item['stock'] = 0
        if not item.get('identifier', None):
            log.msg('### No product ID at ' + response.url,
                    level=log.INFO)
        else:
            if not item['identifier'] in self.id_seen:
                self.id_seen.append(item['identifier'])
                yield item
            else:
                log.msg('### Duplicate product ID at ' + response.url,
                        level=log.INFO)
def parse_product(self, response):
    """Parse a product page into option and sub-option items.

    Two page templates are supported throughout: the newer ``SkuList``/
    ``SkuGroup`` markup and the older "flavor-table" markup — every
    selector has a fallback for the second template.  Each option is a
    deep copy of the base item; each row within an option becomes a
    further copy (sub-item) with its own identifier, name, price, stock.
    """
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), response=response)
    name = response.css('h1.fn::text').re('\S+')
    loader.add_value('name', name)
    loader.add_value('url', response.url)
    loader.add_value('brand', response.meta.get('brand', ''))
    # Drop the first three breadcrumb entries (site-level crumbs).
    categories = hxs.select(
        '//a[@class="bb-crumb__link"]/text()').extract()[3:]
    if not categories:
        categories = hxs.select(
            '//div[@id="breadcrumbs"]//a/text()').extract()[3:]
    loader.add_value('category', categories)
    image_url = hxs.select(
        '//img[@class="Product__img img-responsive"]/@src').extract()
    if not image_url:
        image_url = hxs.select('//img[@class="photo"]/@src').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])
    item = loader.load_item()
    options = response.xpath(
        '//div[@class="SkuList"]/div[contains(@class,"SkuGroup")]')
    if not options:
        options = response.xpath(
            '//div[@id="right-content-prod"]/table[contains(@class, "flavor-table flava-flav")]'
        )
    if options:
        for option in options:
            option_item = deepcopy(item)
            option_name = option.select(
                './/span[@class="SkuGroup__heading__name"]/text()'
            ).extract()
            if not option_name:
                option_name = option.select(
                    './tr/td/span/text()').extract()
            option_name = option_name[0].strip()
            option_item['name'] += ' ' + option_name
            price = ''.join(
                option.select(
                    './/span[@class="SkuGroup__sale-price"]/text()').
                extract()).strip()
            if not price:
                price = ''.join(
                    option.select(
                        './tr/td[contains(@class, "size-price")]//span[@class="price"]/text()'
                    ).extract()).strip()
            option_item['price'] = extract_price(price)
            sub_options = option.select('.//tr[td[@class="availability"]]')
            if not sub_options:
                sub_options = option.select(
                    './/tr[@class="SkuGroup__sku"]')
            if sub_options:
                # Each sub-option row refines the option item further.
                for sub_option in sub_options:
                    sub_item = deepcopy(option_item)
                    identifier = sub_option.select(
                        './/meta[@itemprop="sku"]/@content').extract()
                    if not identifier:
                        identifier = sub_option.select(
                            './/form/input[contains(@name, "catalogRefIds") and @value!=" "]/@value'
                        ).extract()
                    sub_item['identifier'] = identifier[0]
                    name = sub_option.select(
                        './/td[@class="SkuGroup__sku__flavor"]/text()'
                    ).extract()
                    if not name:
                        name = sub_option.select(
                            './/td/h5/text()').extract()
                    if name:
                        sub_item['name'] += ' ' + name[0].strip()
                    price = ''.join(
                        sub_option.select(
                            './tr/td[contains(@class, "size-price")]/span/span[@class="price"]/text()'
                        ).extract())
                    if not price:
                        price = ''.join(
                            sub_option.select(
                                './td[contains(@class, "size-price")]/span[@class="price"]/text()'
                            ).extract())
                    if price:
                        sub_item['price'] = extract_price(price.strip())
                    # Stock defaults to whatever the option item carried;
                    # only explicitly zeroed when no "In Stock" cell exists.
                    in_stock = sub_option.select(
                        './/td[@class="SkuGroup__sku__availability" and contains(text(), "In Stock")]'
                    ).extract()
                    if not in_stock:
                        in_stock = sub_option.select(
                            './/td[@class="availability" and contains(text(), "In Stock")]'
                        ).extract()
                    if not in_stock:
                        sub_item['stock'] = 0
                    yield sub_item
def parse(self, response): hxs = HtmlXPathSelector(response) url = hxs.select('//td[contains(@align,"left")]').select( 'a/@href').extract() print url
def parse_product(response):
    """Scrape one product page (itemprop meta tags) into a Product item.

    NOTE(review): the signature takes only ``response`` — no ``self``.
    If this is meant to be a spider method (like its siblings in this
    file), Scrapy would bind the response to ``self`` and this would
    break at runtime; confirm whether it is a free function or a method
    missing ``self``.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select('//meta[@itemprop="image"]/@content').extract()
    product_identifier = hxs.select(
        '//input[@name="product"]/@value').extract()[0]
    product_name = hxs.select('//div[@class="product-name"]//h1/div/text()'
                              ).extract()[0].strip()
    price = hxs.select('//meta[@itemprop="price"]/@content').extract()
    price = extract_price(price[0])
    category = hxs.select(
        '//meta[@itemprop="category"]/@content').extract()[0].split('>')
    # Meta category can be an empty string; fall back to breadcrumbs
    # (skipping the first two site-level crumbs).
    if not ''.join(category):
        category = hxs.select(
            '//div[@class="breadcrumbs"]/ul/li/a/text()').extract()[2:]
    brand = hxs.select('//meta[@itemprop="brand"]/@content').extract()
    sku = hxs.select('//meta[@itemprop="sku"]/@content').extract()
    # Presence of the out-of-stock paragraph marks the item unavailable.
    stock = hxs.select('//p[@class="availability out-of-stock"]').extract()
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('identifier', product_identifier)
    product_loader.add_value('name', product_name)
    if image_url:
        product_loader.add_value('image_url',
                                 urljoin_rfc(base_url, image_url[0]))
    product_loader.add_value('sku', sku)
    product_loader.add_value('price', price)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', category)
    product_loader.add_value('brand', brand)
    if stock:
        product_loader.add_value('stock', 0)
    product = product_loader.load_item()
    yield product
def parse_product(self, response):
    """Scrape an Argos product page, then fetch its Bazaarvoice reviews.

    Aborts early (with a log line) when the name or price cannot be
    found, or when crawling the 'lifetime' brand and the name does not
    mention it.  On success the item carries KeterMeta metadata and a
    follow-up request is issued for the embedded review feed.
    """
    hxs = HtmlXPathSelector(response)
    url = response.url
    brand = response.meta.get('brand', '')
    l = ProductLoader(item=Product(), response=response)
    name = hxs.select("//div[@id='pdpProduct']/h1/text()").extract()
    if not name:
        self.log("ERROR! NO NAME! %s" % url)
        log.msg('ERROR! NO NAME!')
        return
    name = name[0].strip()
    # Brand sanity filter: skip non-Lifetime items in the Lifetime crawl.
    if brand.lower() == 'lifetime' and name.lower().find('lifetime') == -1:
        return
    price = hxs.select(
        "//div[@id='pdpPricing']/span[@class='actualprice']/span/text()"
    ).extract()
    if not price:
        self.log("ERROR! NO PRICE! %s %s" % (url, name))
        return
    price = "".join(price)
    sku = hxs.select(
        "//span[@class='identifier']/span[contains(@class, 'partnumber')]/text()"
    ).extract()
    if not sku:
        # Missing SKU is logged but deliberately not fatal.
        self.log("ERROR! SKU! %s %s" % (url, name))
        # return
    else:
        l.add_value('sku', sku[0])
    category = ''
    # Category is mined out of an inline script containing EFFECTIVE_URL;
    # the segment after 'category_root' is pipe-delimited.
    s = hxs.select(
        "//script[contains(text(),'EFFECTIVE_URL')]/text()").extract()
    if s:
        s = s[0].strip()
        pos = s.find('category_root')
        if pos != -1:
            s = s[pos:].split('|')
            if len(s) > 1:
                category = s[1].replace('+', ' ')
    l.add_value('category', category)
    if category == '':
        self.log("ERROR! NO Category found! %s %s" % (url, name))
    product_image = hxs.select('//*[@id="mainimage"]/@src').extract()
    if not product_image:
        self.log('ERROR: no product Image found!')
    else:
        image = urljoin_rfc(get_base_url(response),
                            product_image[0].strip())
        l.add_value('image_url', image)
    l.add_value('name', name)
    l.add_value('url', url)
    l.add_value('price', price)
    l.add_value('brand', brand.strip().lower())
    l.add_xpath('identifier', u'//form/input[@name="productId"]/@value')
    product = l.load_item()
    metadata = KeterMeta()
    metadata['brand'] = brand.strip().lower()
    metadata['reviews'] = []
    product['metadata'] = metadata
    reviews_url = 'http://argos.ugc.bazaarvoice.com/1493-en_gb/%s/reviews.djs?format=embeddedhtml'
    # part_number = hxs.select(u'//form/input[@name="partNumber"]/@value').extract()[0]
    # Part number is taken from the URL path, not the form field above.
    part_number = re.search(r'/partNumber/(\d+)', response.url).group(1)
    yield Request(reviews_url % part_number,
                  callback=self.parse_review_page,
                  meta={'product': product})
def parse_product(self, response):
    """Scrape a Rubbermaid product page, then page through its reviews.

    The item itself is built with XPath selectors; afterwards the shared
    Selenium/PhantomJS browser (``self._browser``) loads the same URL and
    clicks the Bazaarvoice "Load more" button up to 25 times so all
    reviews are rendered.  NOTE(review): the loaded ``product`` is built
    but not yielded within this block — presumably consumed by code that
    follows; confirm against the full method.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    shipping_cost = hxs.select(
        './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
    ).extract()
    if not shipping_cost:
        shipping_cost = hxs.select(
            './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
        ).extract()
    loader = ProductLoader(item=Product(), selector=hxs)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')
    loader.add_value(
        'category',
        hxs.select('//div[@class="breadcrum"]/div/a/text()').extract()[-1])
    loader.add_xpath(
        'identifier',
        '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')
    price = hxs.select(
        './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
    ).extract()
    if price:
        loader.add_value('price', price[0])
    else:
        loader.add_value('price', 0)
    # Best-effort: shipping_cost may be empty, so swallow the IndexError.
    try:
        loader.add_value('shipping_cost', shipping_cost[0].strip())
    except:
        pass
    item = hxs.select('//td/strong')
    if item and item[0].select('../text()'):
        # SKU sits in the text next to the <strong>, wrapped in "#(...)".
        loader.add_value(
            'sku',
            item[0].select('../text()').extract()[1].strip('#() '))
    image_url = hxs.select(
        '//div[@id="divImageBlock"]//img/@src').extract()
    if image_url:
        loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
    loader.add_value('brand', 'Rubbermaid')
    product = loader.load_item()
    product['sku'] = product['sku'].upper()
    metadata = KeterMeta()
    metadata['brand'] = 'Rubbermaid'
    metadata['reviews'] = []
    product['metadata'] = metadata
    self.log('>> BROWSER => GET < %s />' % response.url)
    self._browser.get(response.url)
    self.log('>> OK')
    self.log('>> BROWSER => Looking for more reviews ...')
    try:
        load_more_button = self._browser.find_element_by_xpath(
            '//div[@class="bv-content-pagination"]//button')
        more_reviews = load_more_button.is_displayed()
        # Cap the click loop so a sticky button cannot spin forever.
        max_pages = 25
        while more_reviews and max_pages:
            self.log('>> More reviews found...')
            load_more_button.click()
            self.log('>> BROWSER => CLICK "Load more"')
            # Give the AJAX-loaded review page time to render.
            time.sleep(20)
            self.log('>> OK')
            load_more_button = self._browser.find_element_by_xpath(
                '//div[@class="bv-content-pagination"]//button')
            more_reviews = load_more_button.is_displayed()
            max_pages -= 1
        self.log('>> No more reviews...')
    except Exception, e:
        # Typically raised when the pagination button is absent.
        self.log('>> ERROR FOUND => %s' % e)
def browse_and_parse(self, response):
    """Recursively crawl category/listing pages and emit product items.

    Navigation links are deduplicated through ``self.navig_url_set`` so
    each page is visited once.  Price strings look like European-format
    "EUR 1.234,56 - ..." — the split/replace chain takes the first value
    and converts it to a dot-decimal number.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    for subcat_href in hxs.select(
            '//div[@id="navColumnOne"]//a/@href').extract():
        subsubcat_url = urlparse.urljoin(base_url, subcat_href)
        if subsubcat_url not in self.navig_url_set:
            self.navig_url_set.add(subsubcat_url)
            yield Request(subsubcat_url, callback=self.browse_and_parse)
    pages = hxs.select(
        '//div[@id="newProductsDefaultListingTopLinks"]//a/@href').extract(
        )
    for url in pages:
        yield Request(url, callback=self.browse_and_parse)
    # parse product listing in this page, if any
    for product in hxs.select(
            '//table[@class="table-product-attributes"]'):
        product_loader = ProductLoader(item=Product(), response=response)
        url = product.select('.//td[@class="main"]/a/@href').extract()[0]
        product_loader.add_value(
            'identifier',
            re.search(r'products_id=(\d+)', url).groups()[0])
        product_loader.add_value('url', url)
        product_loader.add_value(
            'name',
            product.select(
                './/td[@class="main"]/a/strong/text()').extract()[0])
        # Regular price first; special (discount) price as fallback.
        try:
            price = product.select('.//span[@class="table-price"]/text()')\
                .extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.')
        except:
            price = product.select('.//span[@class="productSpecialPrice"]/text()')\
                .extract()[0].split("-")[0].split(" ")[1].replace('.', '').replace(',', '.')
        product_loader.add_value('price', price)
        yield product_loader.load_item()
    # edge case: product listing page with a single product
    product_price = hxs.select(
        '//h2[@id="productPrices"]/text()').extract()
    if product_price:
        # this product listing page contains a single product
        product_loader = ProductLoader(item=Product(), response=response)
        product_loader.add_xpath('name', '//h1[@id="productName"]/text()')
        product_loader.add_value('url', response.url)
        product_loader.add_value(
            'identifier',
            re.search(r'products_id=(\d+)', response.url).groups()[0])
        try:
            product_loader.add_value('price', product_price[0].split("-")[0]\
                .split(" ")[1].replace('.', '').replace(',', '.'))
        except:
            product_loader.add_value('price', hxs.select('//span[@class="productSpecialPrice"]/text()').extract()[0]\
                .split("-")[0].split(" ")[1].replace('.', '').replace(',', '.'))
        yield product_loader.load_item()
def parse_product(self, response):
    """Scrape a Magento product page, expanding spConfig variant options.

    When the inline ``Product.Config`` JSON is present, one item per
    child product is emitted (identifier suffixed, label appended, price
    taken from the child's finalPrice with 20% VAT added).  Otherwise a
    single item is built from the page's price elements.  Promotion
    flagging attaches ColourBankMeta; related-category links are also
    scheduled at the end.
    """
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    image_url = hxs.select(
        '//div[@id="product-image-container"]//img[1]/@src').extract()
    if not image_url:
        image_url = hxs.select(
            '//img[@id="product-main-image"]/@src').extract()
    # Identifier from the hidden product input, else from the cart form
    # action URL.
    try:
        product_identifier = hxs.select(
            '//input[@name="product"]/@value').extract()[0].strip()
    except:
        product_identifier = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').re(
                r'/product/(\d+)')[0]
    product_name = hxs.select(
        'normalize-space(//h1[@class="product-title"]/text())').extract(
        )[0]
    category = hxs.select(
        '//nav[@id="breadcrumd_abbotandknight"]//li/a/text()').extract()
    category = category[-1].strip() if category else ''
    brand = ''
    promotion = False
    # Brand and promotion state come from the specs table rows.
    feature_names = hxs.select(
        '//*[@id="product-attribute-specs"]//td[@class="feature-title"]/text()'
    ).extract()
    feature_values = hxs.select(
        '//*[@id="product-attribute-specs"]//td[@class="feature-description"]/text()'
    ).extract()
    for name, value in zip(feature_names, feature_values):
        if name.strip() == 'Brand:':
            brand = value.strip()
        elif name.strip() == 'Promotions:' and value.strip() == 'On Sale':
            promotion = True
    options_config = re.search(
        r'var spConfig = new Product.Config\((.*)\)', response.body)
    if options_config:
        product_data = json.loads(options_config.groups()[0])
        products = {}
        # Accumulate " - label" suffixes per child product id.
        for attr in product_data['attributes'].itervalues():
            for option in attr['options']:
                for product in option['products']:
                    products[product] = ' - '.join(
                        (products.get(product, ''), option['label']))
        for identifier, option_name in products.iteritems():
            product_loader = ProductLoader(item=Product(), selector=hxs)
            product_loader.add_value('identifier',
                                     product_identifier + '_' + identifier)
            product_loader.add_value('name', product_name + option_name)
            if image_url:
                product_loader.add_value(
                    'image_url', urljoin_rfc(base_url, image_url[0]))
            # finalPrice is ex-VAT; multiply by 1.2 to get the gross price.
            price = float(product_data['childProducts'][identifier]
                          ['finalPrice']) * 1.2
            product_loader.add_value('price', round(price, 2))
            product_loader.add_value('url', response.url)
            product_loader.add_value('brand', brand)
            product_loader.add_value('category', category)
            product = product_loader.load_item()
            if promotion:
                metadata = ColourBankMeta()
                metadata['sold_as'] = 'Promotion'
                product['metadata'] = metadata
            yield product
    else:
        product_loader = ProductLoader(item=Product(), selector=hxs)
        product_loader.add_value('identifier', product_identifier)
        product_loader.add_value('name', product_name)
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        # Price lookup cascade: current price span, bare text node, then
        # the old-price element when the current one is blank.
        price = hxs.select(
            '//*[@id="product-price-{}"]/span/text()'.format(
                product_identifier)).extract()
        if not price:
            price = hxs.select('//*[@id="product-price-{}"]/text()'.format(
                product_identifier)).extract()
        if price and price[0].strip() == '':
            price = hxs.select(
                '//*[@id="old-price-{}"]/span/text()'.format(
                    product_identifier)).extract()
        price = extract_price(price[0].strip())
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product = product_loader.load_item()
        if promotion:
            metadata = ColourBankMeta()
            metadata['sold_as'] = 'Promotion'
            product['metadata'] = metadata
        yield product
    # Related categories
    for url in hxs.select(
            '//div[@id="product-related"]//a/@href').extract():
        yield Request(
            add_or_replace_parameter(urljoin_rfc(base_url, url), 'limit',
                                     'all'),
            self.parse_categories_products)
def parse_product(self, response):
    """Scrape a Magento product page, resolving prices via the cart when
    needed.

    Price is tried from several on-page locations (price block, special
    price, itemprop, then an inline ``realPrice`` script blob).  Products
    with options (select drop-downs or radio groups) cannot be priced on
    the page: one option combination is posted to the add-to-cart form
    and the remaining combinations travel in meta for ``parse_cart`` to
    continue with.  A row per product is also appended to
    ``self.csv_writer``.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    res = {}
    name = hxs.select("//div[@class='product-name']/h1/text()").extract()
    url = response.url
    price = "".join(
        hxs.select(
            "//div[@class='col-right']/div/div[@class='price-block']/span/span[@class='price']/text()"
        ).re(r'([0-9\,\. ]+)')).strip()
    if not price:
        price = "".join(
            hxs.select(
                "//div[@class='col-right']/div/p[@class='special-price']/span[@class='price']/text()"
            ).re(r'([0-9\,\. ]+)')).strip()
    if not price:
        price = hxs.select('//*[@itemprop="price"]//text()').re(
            r'([\d.,]+)')
    if not price:
        # Last resort: the price lives in an escaped JS string assigned to
        # realPrice; unescape it and parse as HTML.
        try:
            price_popup_hxs = HtmlXPathSelector(
                text=re.search(r'realPrice = (.*)', response.body).groups(
                )[0].replace('\\n', '').replace('\\t', '').replace(
                    '\\', '')[1:-2].strip())
            price = price_popup_hxs.select(
                '//span[@class="price"]/text()').extract()
        except:
            pass
    try:
        sku = hxs.select("//dd[@class='identifier']/text()")[0].extract()
    except:
        sku = ''
    res['url'] = urljoin_rfc(base_url, url)
    res['description'] = sku + ' ' + name[0].strip()
    res['image_url'] = hxs.select(
        '//a[@id="image-link"]/img/@src').extract()
    category = hxs.select('//div[@class="breadcrumbs"]//a/span/text()')
    if category:
        res['category'] = category[-1].extract()
    res['brand'] = hxs.select('//dd[@class="brand"]/text()').extract()
    # res['sku'] = sku
    res['identifier'] = sku
    sku2 = hxs.select("//div[@class='1']/text()").extract()
    if not sku2:
        sku2_ = 0
    else:
        sku2_ = sku2[0]
    sku3 = hxs.select("//div[@class='2']/text()").extract()
    if not sku3:
        sku3_ = 0
    else:
        sku3_ = sku3[0]
    model = hxs.select("//dd[@class='model']/text()").extract()
    if not model:
        self.log('NO MODEL/SKU => %s' % (res['url'], ))
        model_ = ''
    else:
        model_ = model[0]
    res['sku'] = model_  # Using model field as SKU
    self.csv_writer.writerow(
        [res['sku'], sku2_, sku3_, model_, name[0].strip()])
    options_select = hxs.select(
        '//div[@id="product-options-wrapper"]//select')
    options_radio = hxs.select(
        '//div[@id="product-options-wrapper"]//ul[@class="options-list"]')
    if options_select:
        form_action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        # Hidden inputs replayed with every cart submission.
        params = dict(
            zip(
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@name').
                extract(),
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@value').
                extract()))
        product_data = json.loads(
            re.search(r'var spConfig = new Product.Config\((.*)\)',
                      response.body).groups()[0])
        for product in product_data['attributes'].values():
            attr = product['id']
            super_attr_param = u'super_attribute[%s]' % attr
            option_params = []
            for option in product['options']:
                opt_params = params.copy()
                opt_params[super_attr_param] = option['id']
                option_params.append(opt_params)
            # Submit one option now; pass the rest along for parse_cart.
            opt_params = option_params.pop()
            yield FormRequest(form_action,
                              formdata=opt_params,
                              callback=self.parse_cart,
                              meta={
                                  'item': res,
                                  'params': option_params,
                                  'form_action': form_action,
                                  'cookiejar': response.meta['cookiejar']
                              },
                              dont_filter=True)
    elif options_radio:
        form_action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        params = dict(
            zip(
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input[not(@type="radio") and not(@disabled)]/@name'
                ).extract(),
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input[not(@type="radio") and not(@disabled)]/@value'
                ).extract()))
        # NOTE(review): the [1:] slices skip the first radio input —
        # presumably a placeholder; confirm against the page markup.
        options = zip(
            hxs.select(
                '//form[@id="product_addtocart_form"]//input[@type="radio" and not(@disabled)]/@name'
            ).extract()[1:],
            hxs.select(
                '//form[@id="product_addtocart_form"]//input[@type="radio" and not(@disabled)]/@value'
            ).extract()[1:])
        option_params = []
        for option in options:
            opt_params = params.copy()
            opt_params.update({option[0]: option[1]})
            option_params.append(opt_params)
        opt_params = option_params.pop()
        yield FormRequest(form_action,
                          formdata=opt_params,
                          callback=self.parse_cart,
                          meta={
                              'item': res,
                              'params': option_params,
                              'form_action': form_action,
                              'cookiejar': response.meta['cookiejar']
                          },
                          dont_filter=True)
    elif price:
        # No options and an on-page price: emit the product directly.
        res['price'] = price
        yield load_product(res, response)
    else:
        # No options and no visible price: add to cart to discover it.
        form_action = hxs.select(
            '//form[@id="product_addtocart_form"]/@action').extract()[0]
        params = dict(
            zip(
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@name').
                extract(),
                hxs.select(
                    '//form[@id="product_addtocart_form"]//input/@value').
                extract()))
        yield FormRequest(form_action,
                          formdata=params,
                          callback=self.parse_cart,
                          meta={
                              'item': res,
                              'cookiejar': response.meta['cookiejar']
                          },
                          dont_filter=True)
def parse_item(self, response):
    """Parse a search-result product page, verifying the brand matches.

    The page brand must equal the searched brand (carried in meta) or the
    item is dropped.  On success, a Product with KeterMeta is built; when
    a review count is present the first review page is requested instead
    of yielding the product directly.
    """
    hxs = HtmlXPathSelector(response)
    # Ensure the search matched brand, not some part of name or description
    brand = hxs.select(
        u'//div/div/p/b[contains(text(),"Brand")]/../../../div[2]/p/text()'
    ).extract()
    brand = brand and brand[0].strip().lower()
    # XXX No brand field for some suncast products, but they have brand in name
    if not brand:
        logging.warning('Brand not found [%s]' % response.url)
        brand = ''
    name = hxs.select(u'//h1/text()').extract()[0].strip()
    if response.meta['brand'].lower() in name.lower():
        logging.warning('Assume [%s] from name' % response.meta['brand'])
        brand = response.meta['brand'].lower()
    # Normalise any keter brand variant to the canonical spelling.
    if 'keter' in brand.lower():
        brand = 'keter'
    if response.meta['brand'].lower() != brand:
        logging.warning(
            'Brand [%s] not equal to search result brand [%s] [%s]' %
            (response.meta['brand'], brand, response.url))
        return
    product_loader = ProductLoader(item=Product(), response=response)
    product_loader.add_xpath('name', u'//h1/text()')
    sku = hxs.select(u'//meta[@property="eb:id"]/@content').extract()[0]
    product_loader.add_value('sku', sku)
    product_loader.add_value('identifier', sku)
    # Price is split across the span text and a nested span (pence part).
    price = hxs.select('//span[@class="ppPrice"]/text()').extract()[0]
    price += hxs.select(
        '//span[@class="ppPrice"]/span/text()').extract()[0]
    product_loader.add_value('price', price)
    product_loader.add_value('brand', brand.lower())
    product_loader.add_xpath('image_url', '//*[@id="jqzoom"]/@href')
    product_loader.add_value('url', response.url)
    product = product_loader.load_item()
    metadata = KeterMeta()
    metadata['brand'] = brand
    metadata['reviews'] = []
    product['metadata'] = metadata
    response.meta['product'] = product
    n_reviews = hxs.select(
        u'//div[@class="prSnippetReadReviews"]/a/text()').extract()
    if n_reviews:
        # Link text is like "Read 12 reviews" — the count is token [1].
        n_reviews = int(n_reviews[0].split()[1])
        review_sku = hxs.select(
            u'//div[@id="HN_PP"]/@ppskunum').extract()[0]
        # 5 reviews per page
        pages = n_reviews / 5
        if n_reviews % 5 > 0:
            pages += 1
        response.meta['review_sku'] = review_sku
        response.meta['review_pages'] = pages
        response.meta['review_n'] = 1
        yield Request(review_url(response.meta['review_sku'],
                                 response.meta['review_n']),
                      meta=response.meta,
                      callback=self.parse_review)
    else:
        yield product
class RubbermaidSpider(BaseSpider):
    """Spider for rubbermaid.com outdoor/garage product categories.

    Builds Product items with Keter metadata and collects Bazaarvoice
    reviews.  Reviews sit behind a JavaScript "Load more" button, so a
    PhantomJS browser is driven to expand the review list before the
    rendered page source is parsed.
    """

    name = 'keter-rubbermaid.com'
    allowed_domains = ['rubbermaid.com']
    start_urls = [
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=shed-accessories',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=VerticalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=HorizontalSheds',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=Outdoor&SubCatId=DeckBoxesPatioBenches',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=ResinCabinets',
        'http://www.rubbermaid.com/category/pages/subcategorylanding.aspx?CatName=GarageOrganization&SubCatId=FastTrackGarageOrganizationSystem'
    ]

    def __init__(self, *args, **kwargs):
        super(RubbermaidSpider, self).__init__(*args, **kwargs)
        # Make sure the headless browser is shut down with the spider.
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self._browser = PhantomJS.create_browser()
        max_wait = 60  # seconds, applied to both page loads and scripts
        self._browser.set_page_load_timeout(max_wait)
        self._browser.set_script_timeout(max_wait)

    def spider_closed(self):
        """Signal handler: release the PhantomJS instance."""
        self._browser.quit()

    def parse(self, response):
        """Follow every product link inside the category grid."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        for url in hxs.select(
                '//div[@id="foodStorageBlock"]//a/@href').extract():
            yield Request(urljoin_rfc(base_url, url),
                          callback=self.parse_product)

    def parse_product(self, response):
        """Scrape one product page, including its expanded review list."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # Surcharge may appear under either label; try both layouts.
        shipping_cost = hxs.select(
            './/a[contains(text(), "Delivery Surcharge")]//../..//td[2]//span/text()'
        ).extract()
        if not shipping_cost:
            shipping_cost = hxs.select(
                './/td[contains(text(), "Shipping Surcharge")]//..//td[2]//span/text()'
            ).extract()
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '//h1[@id="ProductNameH1"]/text()')
        # Deepest breadcrumb entry is used as the category.
        loader.add_value(
            'category',
            hxs.select('//div[@class="breadcrum"]/div/a/text()').extract()[-1])
        loader.add_xpath(
            'identifier',
            '//form//input[@id="hdnProdId" or @name="hdnProdId"]/@value')
        price = hxs.select(
            './/td[contains(text(), "Price:")]//..//td[2]//span/text()'
        ).extract()
        if price:
            loader.add_value('price', price[0])
        else:
            # No visible price on the page -- record it as zero.
            loader.add_value('price', 0)
        try:
            loader.add_value('shipping_cost', shipping_cost[0].strip())
        except:
            # Best effort: no surcharge cell found, leave the field unset.
            pass
        item = hxs.select('//td/strong')
        if item and item[0].select('../text()'):
            # SKU is taken from the second text node beside the <strong>,
            # stripped of '#', parentheses and spaces -- presumably the
            # cell reads like "Item (#SKU)"; TODO confirm.
            loader.add_value(
                'sku',
                item[0].select('../text()').extract()[1].strip('#() '))
        image_url = hxs.select(
            '//div[@id="divImageBlock"]//img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        loader.add_value('brand', 'Rubbermaid')
        product = loader.load_item()
        product['sku'] = product['sku'].upper()
        metadata = KeterMeta()
        metadata['brand'] = 'Rubbermaid'
        metadata['reviews'] = []
        product['metadata'] = metadata
        # Re-fetch the page in the headless browser so the JS-driven
        # Bazaarvoice review pagination can be expanded.
        self.log('>> BROWSER => GET < %s />' % response.url)
        self._browser.get(response.url)
        self.log('>> OK')
        self.log('>> BROWSER => Looking for more reviews ...')
        try:
            load_more_button = self._browser.find_element_by_xpath(
                '//div[@class="bv-content-pagination"]//button')
            more_reviews = load_more_button.is_displayed()
            # Safety cap: never click "Load more" over 25 times per page.
            max_pages = 25
            while more_reviews and max_pages:
                self.log('>> More reviews found...')
                load_more_button.click()
                self.log('>> BROWSER => CLICK "Load more"')
                # Fixed wait for the next review batch to render.
                time.sleep(20)
                self.log('>> OK')
                load_more_button = self._browser.find_element_by_xpath(
                    '//div[@class="bv-content-pagination"]//button')
                more_reviews = load_more_button.is_displayed()
                max_pages -= 1
            self.log('>> No more reviews...')
        except Exception, e:
            # Typically the button is absent (no reviews) -- log and
            # continue with whatever is already rendered.
            self.log('>> ERROR FOUND => %s' % e)
        # Parse the fully expanded DOM from the browser, not the original
        # Scrapy response.
        hxs = HtmlXPathSelector(text=self._browser.page_source)
        for review in hxs.select(
                '//ol[contains(@class, "bv-content-list-Reviews")]//li[contains(@class, "bv-content-review")]'
        ):
            review_loader = ReviewLoader(item=Review(), selector=review,
                                         date_format='%m/%d/%Y')
            review_loader.add_xpath(
                'date',
                u'.//div[@class="bv-content-datetime"][1]//meta[@itemprop="dateCreated"]/@content'
            )
            review_loader.add_xpath(
                'full_text', u'.//div[@itemprop="reviewBody"]/p/text()')
            review_loader.add_xpath(
                'rating',
                u'.//abbr[contains(@class, "bv-rating-stars-on")][1]/@title')
            review_loader.add_value('url', response.url)
            product['metadata']['reviews'].append(review_loader.load_item())
        yield product
def parse(self, response):
    """Parse a tyre search-results page into Product items.

    ``response.meta['row']`` carries the source spreadsheet row (Width,
    Aspect Ratio, Rim, Speed rating) used to fill in metadata when the
    page itself does not provide a value.
    """
    hxs = HtmlXPathSelector(response)
    row = response.meta['row']
    products = hxs.select('//div[contains(@class, "tyreResult")]')
    for product in products:
        winter = product.select('.//li[@class="cw"]')
        # skip winter tyres
        if winter:
            continue
        loader = ProductLoader(item=Product(), selector=product)
        title = ' '.join(
            map(
                unicode.strip,
                product.select('.//div[@class="tyreName"]//text()').
                extract())).strip()
        brand = product.select('@data-brand').extract()
        if not brand:
            continue
        brand = brand[0].title()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category',
            find_brand_segment(loader.get_output_value('brand')))
        # The title starts with the brand -- drop it, keep the rest.
        title = title[len(brand):].strip()
        # Load/speed rating, e.g. "91H" or "104/102 R": group 1 is the
        # load index (possibly slash-separated), group 2 the speed symbol.
        results = re.search(
            r"\b((?:\d{2,3}/)*(?:\d{2,3}))\s?([A-Z]{1,2}\d?)\b", title)
        if results:
            load_rating = results.group(1)
            speed_rating = results.group(2)
            # Name is what precedes the rating; the remainder after the
            # rating is kept in `title` for manufacturer-mark scanning.
            name = title[:results.start(1)]
            title = title[results.end(2):]
        else:
            load_rating = ''
            speed_rating = row['Speed rating']
            name = title
            title = ''
        price = ''.join(
            product.select(
                './/div[@class="tyrePrice"]//text()').extract()).strip()
        loader.add_value('price', price)
        identifier = product.select(
            './/input[@name="id"]/@value').extract()[0]
        loader.add_value('identifier', identifier)
        loader.add_value('url', '')
        image_url = product.select(
            './/div[@class="tyreImage"]/img/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        metadata['speed_rating'] = speed_rating
        metadata['width'] = row['Width']
        metadata['fitting_method'] = 'Fitted'
        metadata['load_rating'] = load_rating
        # metadata['alternative_speed_rating'] = ''
        # The @data-types attribute flags extra-load ("xl") and run-flat
        # ("rf") variants.
        metadata['xl'] = 'Yes' if bool(
            product.select('./@data-types').re(r'xl')) else 'No'
        run_flat_found = is_run_flat(title)
        metadata['run_flat'] = 'Yes' if bool(
            product.select('./@data-types').re(
                r'rf')) or run_flat_found else 'No'
        # <li> classes mark OEM fitments (bmw/mer/aud/por).
        specif = product.select('.//ul//li/@class').extract()
        man_code = ''
        if 'bmw' in specif:
            man_code = '*'
        elif 'mer' in specif:
            man_code = 'MO'
        elif 'aud' in specif:
            man_code = 'AO'
        elif 'por' in specif:
            man_code = 'NO'
        # cut_name presumably strips a manufacturer-mark token from the
        # string and reports whether it was present (based on usage here).
        # The first match found in `name` wins unless an OEM class above
        # already set the code; otherwise fall back to scanning `title`.
        for code, man_mark in self.all_man_marks.iteritems():
            result, name = cut_name(code, name)
            if result:
                if man_code == '':
                    man_code = man_mark
                break
        if man_code == '':
            for code, man_mark in self.all_man_marks.iteritems():
                result, title = cut_name(code, title)
                if result:
                    man_code = man_mark
                    break
        metadata['manufacturer_mark'] = man_code
        # Remove a stray "XL" marker from the product name as well.
        result, name = cut_name('XL', name)
        loader.add_value('name', name)
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'], load_rating,
             speed_rating))
        # metadata['alternative_speed_rating']))
        prod = loader.load_item()
        prod['metadata'] = metadata
        if not is_product_correct(prod):
            continue
        prod['metadata']['mts_stock_code'] = find_mts_stock_code(
            prod, spider_name=self.name, log=self.log)
        yield prod
def parse_product(self, response):
    """Parse one product detail page into a ProductItem.

    Extracts name, number, description, categories, image, availability
    and sale price, then returns a follow-up Request (carrying the item
    in ``meta``) whose callback, ``parse_shipping_cost``, completes it.

    Returns ``None`` for the store root page.
    Raises ValueError when any mandatory field cannot be extracted.
    """
    # The store root carries no product data -- skip it.
    if response.url == self.store_url + '/':
        return

    hxs = HtmlXPathSelector(response)
    item = ProductItem()

    # Source
    item['source'] = self.store_url

    # Product Name
    tmp = self.extract_xpath(hxs, 'parse_product_product_name')
    if len(tmp) != 1:
        raise ValueError('No Product Name')
    item['product_name'] = tmp[0]

    # Product Number: try progressively weaker page locations.
    tmp = self.extract_xpath(hxs, 'parse_product_product_number_deal')  # In Stock
    if len(tmp) == 0:  # Out of Stock
        tmp = self.extract_xpath(hxs, 'parse_product_product_number_img')
    if len(tmp) == 0:  # Out of Stock and No Image
        tmp = self.extract_xpath(hxs,
                                 'parse_product_product_number_img_add2wl')
    if len(tmp) == 0:
        # Out of Stock, No Image and no Add2wl: fall back to the
        # "tell a friend" share link, which embeds the id as m=tell&p=<id>.
        tmp = self.extract_xpath(hxs, 'parse_product_product_number_share')
        if len(tmp) == 0:
            raise ValueError('No Product Number')
        ms = re.search(r'm=tell&p=(\d+)', tmp[0])
    else:
        # The other locations embed the id as pid=<id>.
        ms = re.search(r'pid=(\d+)', tmp[0])
    # FIX: the original called ms.groups() unguarded, so a non-matching
    # page raised AttributeError instead of the intended ValueError.
    if ms is None:
        raise ValueError('No Product Number')
    item['product_number'] = ms.group(1)

    # Description
    # FIX: was "len(tmp) is 0" -- identity comparison on ints is an
    # unreliable CPython artefact; use equality.
    tmp = self.extract_xpath(hxs, 'parse_product_description')
    if len(tmp) == 0:
        raise ValueError('No Description')
    item['description'] = '\n'.join(map(lambda s: s.strip(), tmp)).strip()

    # Category Name: breadcrumb entries; a new path begins at each 'Home'.
    tmp = self.extract_xpath(hxs, 'parse_product_categories')
    if len(tmp) <= 0:
        raise ValueError('No Categories')
    cg_paths = []
    cg_path = []
    for c in tmp:
        c = c.strip()
        if c == '':
            continue
        elif c == 'Home':
            # Flush the path collected so far and start a fresh one.
            cg_path = ProductItem.CG_PATH_SEP.join(cg_path)
            if cg_path != '':
                cg_paths.append(cg_path)
            cg_path = ['Home']
        else:
            cg_path.append(c)
    cg_paths.append(ProductItem.CG_PATH_SEP.join(cg_path))
    item['category_name'] = ProductItem.CG_PATHS_SEP.join(cg_paths)

    # Product URL
    item['product_url'] = response.url

    # Image URL
    tmp = self.extract_xpath(hxs, 'parse_product_image_url')
    if len(tmp) == 0:
        raise ValueError('No Image URL')
    # Reuse the extraction above instead of re-querying the page.
    item['image_url'] = tmp[0]

    # Product Condition
    item['product_condition'] = ProductItem.PC_NEW

    # Availability: normalise by stripping all whitespace and lowercasing.
    tmp = self.extract_xpath(hxs, 'parse_product_availability')
    if len(tmp) != 1:
        raise ValueError('No Availability')
    tmp = self.AVAIL_CHOICES.get(re.sub(r'\s+', '', tmp[0].lower()))
    if not tmp:
        raise ValueError('No such Availability')
    item['availability'] = tmp

    # Sale Price: strip the currency symbol and whitespace.
    # FIX: was r'[$|\s]', whose character class also deleted literal '|'.
    tmp = self.extract_xpath(hxs, 'parse_product_sale_price')
    tmp = re.sub(r'[$\s]', '', ''.join(tmp))
    item['sale_price'] = float(tmp)

    # On Sale: flagged by either a sale image or a "save" banner.
    item['on_sale'] = 0
    if (len(self.extract_xpath(hxs, 'parse_product_on_sale_img')) > 0 or
            len(self.extract_xpath(hxs, 'parse_product_on_sale_save')) > 0):
        item['on_sale'] = 1

    # Currency
    item['currency'] = 'AUD'

    # Manufacturer (not available on this site)
    item['manufacturer'] = ''

    # Shipping Cost is fetched via a second request; the partially
    # filled item travels along in request.meta.
    request = Request(self.SC_URL % (item['product_number'],),
                      callback=self.parse_shipping_cost,
                      dont_filter=True)
    request.meta['item'] = item
    return request
def parse_categories(self, response): hxs = HtmlXPathSelector(response) categories = hxs.select('//*[@id="PageMenu"]/div/a/@href').extract() for category in categories: url = urljoin_rfc(get_base_url(response), category) yield Request(url, callback=self.parse_products)
def parse(self, response): base_url = get_base_url(response) hxs = HtmlXPathSelector(response) items = hxs.select("//a/@href").extract() for item in items: yield Request(urljoin_rfc(base_url,item), callback=self.parse_items)
def parse(self, response):
    """Extract the top-level department categories from the touch nav.

    Parent entries that are not links (smaller groupings such as
    "resources" and "shops") are currently not extracted as categories
    themselves; departments living outside the "nav-pro" (products)
    container are only flagged as special.

    Yields a Request per department, handled by parseCategory, with the
    department info carried in meta.

    TODO: add extraction of the level-3 categories (broadest level:
    products, services, ...).
    """
    hxs = HtmlXPathSelector(response)
    parent_links = hxs.select(
        "//div[@id='container']/div[@id='header']//nav/ul[@id='nav-touch']//h4/a"
    )
    # Department ids are assigned in document order, starting at 1.
    for department_id, link in enumerate(parent_links, 1):
        item = CategoryItem()
        item['text'] = link.select('text()').extract()[0]
        item['url'] = link.select('@href').extract()[0]
        item['level'] = 1
        # Container element four levels above the <a> inside the <h4>.
        container = link.select(
            "parent::node()/parent::node()/parent::node()/parent::node()")
        container_class = container.select('@class').extract()
        # Mark the item as special when its container is not "nav-pro".
        if not container_class or container_class[0] != "nav-pro":
            item['special'] = 1
        # Parent entries carry no real URL (href is just "#"), so only
        # the text is recorded.
        container_text = container.select('a/text()').extract()
        if container_text:
            item['parent_text'] = container_text[0]
        yield Request(item['url'],
                      callback=self.parseCategory,
                      meta={'parent': item,
                            'level': 1,
                            'department_text': item['text'],
                            'department_url': item['url'],
                            'department_id': department_id})
def parse_product(self, response): base_url = get_base_url(response) hxs = HtmlXPathSelector(response) res = {} options = hxs.select( "//select[@name='SKURecNum']/option/text()").extract() if options: #options name = hxs.select( "//div[@class='buybox']/table/tr/td/h1/text()").extract() if not name: name = hxs.select( "//h1[@class='productNameDN']/text()").extract() url = response.url for option in options: try: name2 = re.match(r'(.*) -.*', option.strip()).group(1) except: continue try: price = re.match( r'.*\xa3(.*)', option.replace("\r", "").replace("\n", "").strip()).group(1) except: price = None if not price: price = "".join( hxs.select( "//p[@class='ProductDetailPrice']/a/font[@class='BodyMain']/text()" ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( '//p[@class="ProductDetailPrice"]/font[1]/b/text()' ).re(r'\xa3([0-9\,\. ]+)')).strip() res['url'] = urljoin_rfc(base_url, url) res['description'] = name[0].strip() + u' ' + name2 res['price'] = price yield load_product(res, response) else: name = hxs.select( "//div[@class='buybox']/table/tr/td/h1/text()").extract() if not name: name = hxs.select( "//h1[@class='productNameDN']/text()").extract() if not name: name = hxs.select( "//div[@class='buybox']/table/tr/td/table/tr/td/h1/text()" ).extract() url = response.url price = "".join( hxs.select( "//p[@class='ProductDetailPrice']/strong/a/font[@class='BodyMain']/text()" ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( "//p[@class='ProductDetailPrice']/a/font[@class='BodyMain']/text()" ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( '//p[@class="ProductDetailPrice"]/strong/font/b/text()' ).re(r'\xa3([0-9\,\. ]+)')).strip() if not price: price = "".join( hxs.select( '//p[@class="ProductDetailPrice"]/font/b/text()' ).re(r'\xa3([0-9\,\. ]+)')).strip() res['url'] = urljoin_rfc(base_url, url) res['description'] = name[0].strip() res['price'] = price yield load_product(res, response)