def parse_developer(self, response):
    item = response.meta.get("item")
    hxs = HtmlXPathSelector(response)
    item["stars"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Stars")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["watchers"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Watchers")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["forks"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Forks")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["merged_pull_requests"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Merged Pull Requests")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["total_issues"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Total Issues")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["closed_issues"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Closed Issues")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["contributors"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Contributors")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["total_new_commits"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Total new commits")]/following-sibling::p[1]/text()'
    ).extract_first()
    yield item
    self.item_counts += 1
    self.logger.info("current item counts <{}>".format(self.item_counts))
def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    nodes = hxs.xpath("//div[@class='list-infoBox']")
    city = "".join(hxs.xpath('//a[@class="choose-city"]/span/text()').re(r'\S+'))
    for node in nodes:
        items_list = []
        title = "".join(node.xpath('.//a[1]/@title').extract())
        nowprice = "".join(node.xpath(".//i[@class='fc-org priType']/text()").extract())
        url = urljoin(self.start_urls[0], "".join(node.xpath('.//a[1]/@href').extract()))
        oldprice = "".join(node.xpath('.//p[@class="priType-s"]/s/text()').extract())
        drivetime = "".join(node.xpath('.//p[@class="fc-gray"]/descendant::text()').extract())
        items_list.append([url, title, nowprice, oldprice, drivetime, city, meta['brand_name']])
        writer.writerow([
            x.encode("utf8").replace("\n", "").replace("\t", "").replace("\r", "").replace(" ", "")
            for x in items_list[0]
        ])
    next_page = hxs.xpath('//a[@class="next"]/@href').extract()
    if next_page:
        url = urljoin(self.start_urls[0], next_page[0])
        yield Request(url, callback=self.parse_item, meta=meta)
def parse(self, response):
    #print response.url
    response_selector = HtmlXPathSelector(response)
    list = response_selector.select(self.list_xpath)
    # If there is no "page" in meta, this is the first fetch; treat it as page 1
    try:
        page = response.meta['page']
    except:
        page = 1
    # Follow each entry's home page
    for sel in list:
        home = sel.select(self.home_xpath).extract()[0]
        home_link = home[4:]
        #print home_link
        yield scrapy.Request(url=home_link,
                             callback=self.parse_home,
                             meta={"sub_url": response.url},
                             headers=DEFAULT_REQUEST_HEADERS)
    # The pager info is a string such as PageList(1,10,10,104,'gyyclass=0&keyword=&CY_id=0&city=0',8)
    nextMsg = response_selector.select(self.nextpage_xpath).extract()[0]
    # Parse that string to get the current page number
    currentPage = nextMsg.split(",")[0].split("(")[1]
    # Only request the next page when the current page equals the page in meta;
    # e.g. if page is 12 but there is no page 12, currentPage stays at 11, so the
    # last page has already been reached and no further request is made.
    if cmp(str(page), str(currentPage)) == 0:
        if page == 1:
            url = response.url + "?page=" + str(page + 1)
        else:
            url = response.url.split("=")[0] + "=" + str(page + 1)
        # Request the next page
        yield scrapy.Request(url=url,
                             callback=self.parse,
                             meta={"page": page + 1},
                             headers=DEFAULT_REQUEST_HEADERS)
def parse_home(self, response):
    response_selector = HtmlXPathSelector(response)
    field_value = {}
    field_value['url'] = response.url
    field_value['sub_url'] = response.meta["sub_url"]
    for field_name in self.home_field_xpath:
        field_value[field_name] = ""
        if len(response_selector.select(self.home_field_xpath[field_name]).extract()) > 0:
            if field_name == "industry":
                # The industry field can hold several values, so join them into one comma-separated string
                industries = response_selector.select(self.home_field_xpath[field_name]).extract()
                field_value[field_name] = ",".join(industries)
            else:
                field_value[field_name] = response_selector.select(
                    self.home_field_xpath[field_name]).extract()[0]
    # Build the absolute image URL
    field_value["image"] = "http://" + response.url.split("/")[2] + field_value["image"]
    yield scrapy.Request(url=response.url + "detail/",
                         callback=self.parse_detail,
                         meta=field_value,
                         headers=DEFAULT_REQUEST_HEADERS)
def parse_company(self, response):
    response_selector = HtmlXPathSelector(response)
    list = response_selector.select(self.companylist_xpath)
    #print list
    for sel in list:
        item = ParkCompanyItem()
        company_name = sel.select(self.companyname_xpath).extract()[0]
        #print company_name
        # print response.meta["park_md5"]
        item['company_name'] = company_name
        item["park_md5"] = response.meta["park_md5"]
        yield item
    # Next page
    page = response.meta['page']
    # Paging works the same way as on the list page
    if len(response_selector.select(self.nextpage_xpath).extract()) > 0:
        nextMsg = response_selector.select(self.nextpage_xpath).extract()[0]
        currentPage = nextMsg.split(",")[0].split("(")[1]
        if cmp(str(page), str(currentPage)) == 0:
            if page == 1:
                url = response.url + "?page=" + str(page + 1)
            else:
                url = response.url.split("=")[0] + "=" + str(page + 1)
            print url
            yield scrapy.Request(url=url,
                                 callback=self.parse_company,
                                 meta={"park_md5": item["park_md5"], "page": page + 1},
                                 headers=DEFAULT_REQUEST_HEADERS)
def parse_yhzc(self, response):
    response_selector = HtmlXPathSelector(response)
    yhzcList = response_selector.select(self.yhzc_xpath).extract()
    yhzcStr = "".join(yhzcList)
    message = response.meta
    # Assemble the park item from the fields collected along the crawl
    item = ParkBaseItem()
    item['preferential'] = yhzcStr.decode("utf-8")
    item['name'] = message['name']
    item['level'] = message['level']
    item['address'] = message['address']
    item['area'] = message['area']
    item['industry'] = message['industry']
    item['image'] = message['image']
    item['detail'] = message['detail']
    item['plan'] = message['plan']
    item['url'] = message['url']
    item["sub_url"] = message["sub_url"]
    m2 = hashlib.md5()
    m2.update(item["url"])
    item["url_md5"] = m2.hexdigest()
    item['created'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    # print item
    yield item
    yield scrapy.Request(url=item['url'] + "company/",
                         callback=self.parse_company,
                         meta={"park_md5": item["url_md5"], "page": 1},
                         headers=DEFAULT_REQUEST_HEADERS)
def test_htmlxpathselector(self):
    with warnings.catch_warnings(record=True):
        hs = HtmlXPathSelector(text=self.text)
        self.assertEqual(hs.select("//div").extract(),
                         [u'<div><img src="a.jpg"><p>Hello</p></div>'])
        self.assertRaises(RuntimeError, hs.css, 'div')
def test_htmlxpathselector(self):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        hs = HtmlXPathSelector(text=self.text)
        self.assertEqual(hs.select("//div").extract(),
                         [u'<div><img src="a.jpg"><p>Hello</p></div>'])
        self.assertRaises(RuntimeError, hs.css, 'div')
def parse_cats(self, response):
    hxs = HtmlXPathSelector(response)
    item = Link2LinkItem(response.meta['item'])
    all_product_links = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href").extract()
    for product_link in all_product_links:
        yield Request(self.get_url(product_link), callback=self.parse_products, meta={'item': item})
def parse_list(self, response):
    hxs = HtmlXPathSelector(response)
    activity_nodes = hxs.select("//li[@class='list-entry']/div[2]/div/a")
    for node in activity_nodes:
        activity_url = self.href(node)
        self.log('Found activity url: %s' % activity_url)
        yield Request(activity_url, callback=self.parse_activity)
def parse(self, response): hxs = HtmlXPathSelector(response) items = hxs.select('//*[@id="col3_content"]/div/div/div/a') for item in items: url = urljoin(response.url, item.select('./@href').extract()[0]) self.log("Following URL %s" % (url), level=log.DEBUG) yield Request(url, callback=self.parseLineType)
def parse_torrent(self, response):
    x = HtmlXPathSelector(response)
    torrent = TorrentItem()
    torrent['url'] = response.url
    torrent['name'] = x.select("//h1/text()").extract()
    torrent['description'] = x.select("//div[@id='description']").extract()
    torrent['size'] = x.select("//div[@id='info-left']/p[2]/text()[2]").extract()
    return torrent
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    brands = hxs.select("//div[@id='contentFull']/div/p/a/@href")
    # self.item = Link2LinkItem()
    item = Link2LinkItem()
    for brand in brands:
        brand_page = brand.extract()
        request = Request(self.get_url(brand_page), callback=self.parse_brands, meta={'item': item})
        yield request
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    appsl = hxs.select('//div[contains(@class,"loops-wrapper list-thumb-image")]/div')
    for entry in appsl:
        item = AppslistItem()
        item["title"] = entry.select("div/h2/a/text()").extract()
        item["image_link"] = entry.select("p/a/img/@src").extract()
        item["desc"] = entry.select("div/p/text()").extract()
        yield item
def parse_plan(self, response):
    response_selector = HtmlXPathSelector(response)
    planList = response_selector.select(self.plan_xpath).extract()
    planStr = "".join(planList)
    baseUrl = response.meta['url']
    response.meta['plan'] = planStr.decode("utf-8")
    yield scrapy.Request(url=baseUrl + "yhzc/",
                         callback=self.parse_yhzc,
                         meta=response.meta,
                         headers=DEFAULT_REQUEST_HEADERS)
def parse_carro(self, response):
    hxs = HtmlXPathSelector(response)
    carro = Carro()
    carro['modelo'] = hxs.select('//div[@class="section primary"]/h1/text()').extract()[0]
    anio, tipo = hxs.select('//div[@class="section primary"]/h4/text()').extract()[0].split("|")
    carro['anio'] = anio
    carro['tipo'] = tipo
    carro['precio'] = hxs.select('//div[@class="section primary"]/h3/text()').extract()[0].split(" ")[-1]
    carro['url'] = response.url
    yield carro
def parse_brands(self, response):
    hxs = HtmlXPathSelector(response)
    brands = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href")
    # for brand in brands:
    item = Link2LinkItem(response.meta['item'])
    products_category = hxs.select("//*[@id='contentFull']/fieldset[2]/div/p[2]/a/text()").extract()
    item['Brand'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
    if "Products" in hxs.select("//*[@id='contentFull']/fieldset[2]/legend/text()").extract()[0]:
        # Categories exist, i.e. Dog, Cat
        all_category_links = hxs.select("//*[@id='contentFull']/fieldset[2]/div/p[2]/a/@href").extract()
        index = 0
        # c_list=[]
        for product in products_category:
            item = Link2LinkItem(response.meta['item'])
            item['Brand'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
            category_link = all_category_links[index]
            item['Products_Category'] = product
            index = index + 1
            # yield item
            yield Request(self.get_url(category_link), callback=self.parse_cats, meta={'item': item})
    else:
        # A direct product link is available.
        item['Brand'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
        if "Products_Category" not in item:
            item['Products_Category'] = "Not Available"
        all_product_links = hxs.select("//div[@id='contentFull']/fieldset[2]/div/p/a/@href").extract()
        for product_link in all_product_links:
            yield Request(self.get_url(product_link), callback=self.parse_products, meta={'item': item})
def parse_detail(self, response):
    response_selector = HtmlXPathSelector(response)
    detailList = response_selector.select(self.detail_xpath).extract()
    detailStr = "".join(detailList)
    baseUrl = response.meta['url']
    response.meta['detail'] = detailStr.decode("utf-8")
    yield scrapy.Request(url=baseUrl + "plan/",
                         callback=self.parse_plan,
                         meta=response.meta,
                         headers=DEFAULT_REQUEST_HEADERS)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    print "*" * 66
    print hxs.xpath("//script[@class='J_auto-load']/text()").extract()
    print "-" * 66
    return
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    item = DeputeInfoItem()
    item["party"] = hxs.select('//td[@id="cbfv_55"]/text()').extract()
    item["name"] = hxs.select('//h1[@class="title"]/span/span/text()').extract()
    account = hxs.select('//td[@id="cbfv_60"]/a/text()').extract()
    print(account)
    if len(account) != 0:
        item["twitter"] = account[0].split('/')[-1]
    else:
        item["twitter"] = []
    yield item
def parse_renqizhishu(self, response):
    html = HtmlXPathSelector(response)
    popularity_ranking = ''.join(
        html.xpath(u"//*[contains(text(),'第')]/text()").extract())
    item = {"popularity_ranking": popularity_ranking}
    self.con.hmset(self.jobid, item)
    del response
def parse_district_num(self, response):
    html = HtmlXPathSelector(response)
    district_num = ''.join(
        html.xpath("//span[@class='num']/text()").extract()).replace("(", "").replace(")", "")
    item = {"district_num": district_num}
    self.con.hmset(self.jobid, item)
    del response
def parseLineType(self, response):
    hxs = HtmlXPathSelector(response)
    lineType = hxs.select('//*[@id="pagebar"]/h1/text()').extract()[0].strip()
    self.log('Processing %s...' % (lineType), level=log.DEBUG)
    items = hxs.select('//*[@id="tbl_fahrplaene"]/tbody/tr/td[2]/a')
    for item in items:
        url = urljoin(response.url, item.select('./@href').extract()[0])
        req = Request(url, callback=self.parseFahrplan)
        req.meta['lineType'] = lineType
        req.meta['lineName'] = item.select('./text()').extract()[0].strip()
        self.log("Following URL %s" % (url), level=log.DEBUG)
        yield req
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    all_city_name = hxs.xpath('//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]/text()').re(r'\S+')
    all_city_url = hxs.xpath('//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]/@href').extract()
    for item in zip(all_city_name, all_city_url):
        if len(item) == 2:
            yield Request(urljoin(self.start_urls[0], item[1]), callback=self.parse_list)
def parse_order_year(self, response):
    hxs = HtmlXPathSelector(response)
    order_urls = hxs.xpath(
        "//div[@class='time-list']/ul/li[position()>1]/a/@_val").extract()[0:2]
    order_url = "https://order.jd.com/center/list.action?search=0&d="
    for urls in order_urls:
        yield Request(url=order_url + urls, callback=self.parse_order_list)
def parse_list(self, response):
    hxs = HtmlXPathSelector(response)
    all_brand_name = hxs.xpath('//span[contains(@class,"brand-all")]/descendant::a/text()').re(r'\S+')
    all_brand_url = hxs.xpath('//span[contains(@class,"brand-all")]/descendant::a/@href').extract()
    for item in zip(all_brand_name, all_brand_url):
        if len(item) == 2:
            yield Request(urljoin(self.start_urls[0], item[1]),
                          callback=self.parse_item, meta={'brand_name': item[0]})
def parse_community(self, response):
    item = response.meta.get("item")
    coin_name = response.meta.get("coin_name")
    hxs = HtmlXPathSelector(response)
    item['subscribers'] = hxs.xpath(
        '//a[@rel="nofollow"][contains(text(),"Subscribers")]/../following-sibling::p[1]/text()'
    ).extract_first()
    item["followers"] = hxs.xpath(
        '//a[@rel="nofollow"][contains(text(),"Followers")]/../following-sibling::p[1]/text()'
    ).extract_first()
    item["likes"] = hxs.xpath(
        '//a[@rel="nofollow"][contains(text(),"Likes")]/../following-sibling::p[1]/text()'
    ).extract_first()
    item["avg_users_online"] = hxs.xpath(
        '//div[contains(@class, "social-media")][contains(text(), "Online")]/p[1]/text()'
    ).extract_first()
    item["avg_new_hot_posts_per_hour"] = hxs.xpath(
        '//div[contains(@class, "social-media")][contains(text(), "New Hot")]/p[1]/text()'
    ).extract_first()
    item["avg_new_comments_on_hot_posts_per_hour"] = hxs.xpath(
        '//div[contains(@class, "col-md")][contains(text(), "Comments")]/p[1]/text()'
    ).extract_first()
    url = "https://www.coingecko.com/en/coins/{}/developer#panel".format(coin_name)
    yield Request(url=url,
                  callback=self.parse_developer,
                  meta={"item": item},
                  dont_filter=True)
def parse_products(self, response):
    hxs = HtmlXPathSelector(response)
    item = Link2LinkItem(response.meta["item"])
    item['Specification'] = hxs.select("//div[@id='tab1']/p/text()").extract()
    item['Product_Name'] = hxs.select("//*[@id='contentFull']/h1/text()").extract()
    ga = hxs.select("//*[@id='tab2']/p/text()").extract()
    if ga:
        item['Guaranteed_Analysis'] = ga
    else:
        item['Guaranteed_Analysis'] = "Not Available"
    item['Product_Description'] = hxs.select(".//*[@id='contentFull']/p/text()").extract()
    yield item
def scrapeFlights(self, response): """Scrape the flights into a Fare() object.""" html = HtmlXPathSelector(response) errors = html.select("//ul[@id='errors']/li/text()") # Catch errors given by Southwest's page if (len(errors) > 0): self.log("Error: %s" % theError, level=log.ERROR) return # Conveniently packaged flight info in string form for form submission xpath = '//div[@class="productPricing"]//input/@title' selectors = [ '//table[@id="faresOutbound"]//td[@class="price_column "]' + xpath, # business select '//table[@id="faresOutbound"]//td[@class="price_column"][1]' + xpath, # anytime '//table[@id="faresOutbound"]//td[@class="price_column"][2]' + xpath # wanna get away ] fareList = [] for selector in selectors: fareList.append(html.select(selector).extract()) # Process that info and load into a Fare() item. i = 0 fareType = ["Business Select", "Anytime", "Wanna Get Away"] for fare in fareList: self.log("Faretype: %d %s" % (i, fareType[i])) for flightString in fare: if (flightString[0] == 'D'): flightData = Util.parseFlight(flightString, self.outDate.date()) self.log("Found: %s" % flightString) flight = Fare() for key in flightData: flight[key] = flightData[key] flight['origin'] = self.origin flight['destination'] = self.destination flight['date'] = self.outDate flight['faretype'] = fareType[i] self.log('Added') yield flight else: continue i += 1
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    last_page_node = hxs.select("//div[@class='paginator']/a[last()]")
    last_page = int(self.href(last_page_node).split('=')[1])
    pages = range(10, last_page, 10)
    #pages = []
    for page in pages:
        page_url = response.url + '?start=' + str(page)
        self.log('Found page url: %s' % page_url)
        yield Request(page_url, callback=self.parse_list)
    # process first page
    # self.parse_list(response)
    yield Request(response.url, callback=self.parse_list)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    data = {}
    data['key'] = response.url
    data['s'] = 1
    if MongoDbUtil.MongoDbConnect.count(self.modle, data) == 0:
        print '-------------------------------------------', response.url
        self.parseAllA(response.url, hxs)
        try:
            MongoDbUtil.MongoDbConnect.update(self.modle, data)
            self.finishUrls.append(response.url)
        except Exception as e:
            print response.url, ":", e
            data = {}
            data['key'] = response.url
            data['type'] = self.error_url
            data['e'] = e
            MongoDbUtil.MongoDbConnect.save(self.modle_error, data)
    data = {}
    data['s'] = 0
    urls = MongoDbUtil.MongoDbConnect.list(self.modle, data)
    try:
        yield Request(urls[0]['key'], callback=self.parse)
    except Exception as e:
        print "yield Request :", e
def commons_speech_feeder(working_dir, _fetch_url=None):
    """Return a generator that yields file urls"""
    # TODO: find a faster way of doing this
    if not _fetch_url:
        _fetch_url = fetch_url
    list_url = 'http://ukparse.kforge.net/parldata/scrapedxml/debates/'
    log.debug("Fetching index")
    data = _fetch_url(list_url, "Failed to fetch index.")
    if data:
        hxs = HtmlXPathSelector(text=unicode(data, errors="ignore"))
        selector = hxs.select(r'//table//td//a/@href')
        check_href = create_href_checker(re.compile(r'^debates\d{4}'), working_dir)
        urls = selector.extract()
        log.debug("Fetched %s urls from index" % len(urls))
        for href in urls:
            if check_href(href):
                yield urlparse.urljoin(list_url, href)
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    car_lis = hxs.select('//li[@class="standard_200 t gal-normal-mot"]')
    # Parse each car <li>
    for li in car_lis:
        carro = Carro()
        carro['modelo'] = li.select('div/h3/a/text()').extract()[0]
        anio, tipo = li.select('div[@class="itemInfo"]//h4/strong/text()').extract()[0].split("|")
        carro['anio'] = anio
        carro['tipo'] = tipo
        carro['precio'] = li.select('div[@class="itemInfo"]//li[@class="precio_gal"]/strong/text()').extract()[0].split(" ")[-1]
        carro['url'] = li.select('div/h3/a/@href').extract()[0]
        yield carro
    url_paginas = hxs.select("//div[@id='paginador']/a/@href").extract()
    for pagina in url_paginas:
        yield Request(pagina, callback=self.parse)
def scrapeFlights(self, response): """Scrape the flights into a Fare() object.""" html = HtmlXPathSelector(response) errors = html.select("//ul[@id='errors']/li/text()") # Catch errors given by Southwest's page if ( len(errors) > 0 ): self.log("Error: %s" % theError , level=log.ERROR) return # Conveniently packaged flight info in string form for form submission xpath = '//div[@class="productPricing"]//input/@title' selectors = [ '//table[@id="faresOutbound"]//td[@class="price_column "]' + xpath, # business select '//table[@id="faresOutbound"]//td[@class="price_column"][1]' + xpath, # anytime '//table[@id="faresOutbound"]//td[@class="price_column"][2]' + xpath # wanna get away ] fareList = [] for selector in selectors: fareList.append( html.select(selector).extract() ) # Process that info and load into a Fare() item. i = 0 fareType = ["Business Select", "Anytime", "Wanna Get Away"] for fare in fareList: self.log("Faretype: %d %s" % (i, fareType[i]) ) for flightString in fare: if ( flightString[0] == 'D' ): flightData = Util.parseFlight(flightString, self.outDate.date()) self.log("Found: %s" % flightString) flight = Fare() for key in flightData: flight[key] = flightData[key] flight['origin'] = self.origin flight['destination'] = self.destination flight['date'] = self.outDate flight['faretype'] = fareType[i] self.log('Added') yield flight else: continue i += 1
def parse_safetycenter(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta["item"]
    item["userphone"] = "".join(
        hxs.xpath("//strong[@id='mobile']/text()").re(r'\S+'))
    item["useridcard"] = "".join(
        hxs.xpath(
            u"//span[contains(text(),'您认证的实名信息:')]/following::strong[2]/text()"
        ).extract())
    order_url = "https://order.jd.com/center/list.action"
    self.items.append(item)
    yield Request(url=order_url, callback=self.parse_order_year)
def parse_list(self, response):
    hxs = HtmlXPathSelector(response)
    # print(hxs.extract())
    body = hxs.select('//div[@id="contents"]')
    # print(body.extract())
    item = MovieItem()
    item['title'] = body.select('//div[@id="bo_v_title"]/h1/text()').extract()[0]
    item['link'] = body.select('//div[@class="bo_v_file"]/a/@href').extract()[1]
    item['date'] = body.select('//div[@id="bo_v_info"]/table/tbody/tr/td/table/tbody/tr/td/text()').extract()[2]
    if not item['title']:
        print("Table Title")
    else:
        print(item)
    yield item
def parse(self, response): hxs = HtmlXPathSelector(response) for data in hxs.xpath("//*[@id='gecko-table']/tbody/tr"): item = IcoItem() item["name"] = data.xpath( './/td[@class="coin-name"]//span[@class="coin-content-name"]/text()' ).extract_first() item["symbol"] = data.xpath( './/td[@class="coin-name"]//span[@class="coin-content-symbol"]/text()' ).extract_first() item["img_url"] = "https:" + data.xpath( './/td[@class="coin-name"]//img/@data-src').extract_first() item["other"] = data.xpath( './/td[@class="coin-name"]//small/text()').extract_first() item["developer"] = data.xpath( './/td[@class="td-developer_score dev"]/div[1]/text()' ).extract_first() item["community"] = data.xpath( './/td[@class="td-community_score community"]/div[1]/text()' ).extract_first() item["public_interest"] = data.xpath( './/td[@class="td-public_interest_score pb-interest"]/div[1]/text()' ).extract_first() item["total"] = data.xpath( './/td[@class="total"]/div[1]/text()').extract_first() coin_name = data.xpath( './/td[@class="coin-name"]//a[@class="currency_exchangable_chart_link"]/@href' ).re(r'price_charts/([\S\s]+?)/usd') url = "https://www.coingecko.com/en/coins/{}#panel".format( coin_name[0]) yield Request(url=url, callback=self.parse_baseinfo, meta={ "item": item, "coin_name": coin_name[0] }, dont_filter=True) next_page_url = hxs.xpath('//link[@rel="next"]/@href').extract_first() self.logger.info("current page url <{}>".format(next_page_url)) yield response.follow(next_page_url, self.parse)
def parse_order_list(self, response):
    hxs = HtmlXPathSelector(response)
    orders_urls = hxs.xpath("//a[@name='orderIdLinks']/@href").extract()
    headers = dict(response.request.headers)
    headers.update({"Referer": None})
    sess = {}
    cookie = response.request.headers.getlist('Cookie')[0].split(";")
    for cook in cookie:
        sess.update({
            cook[:cook.index("=")]: cook[cook.index("=") + 1:].replace('"', "")
        })
    for order_url in orders_urls:
        if "orderId=" in order_url or "orderid" in order_url:
            #self.queues.push(Request(url=urljoin(self.start_urls[0],order_url),meta={"jobid":self.jobid},headers=headers))
            yield Request(url=urljoin(self.start_urls[0], order_url),
                          cookies=sess,
                          meta={"jobid": self.jobid},
                          callback=self.parse_items)
    next_page_url = hxs.xpath("//a[@class='next']/@href").extract()
    if next_page_url:
        for next_url in next_page_url:
            yield Request(url=urljoin(self.start_urls[0], next_url),
                          callback=self.parse_order_list)
def parse_userinfo(self, response):
    hxs = HtmlXPathSelector(response)
    item = response.meta["item"]
    item["userloginname"] = "".join(
        hxs.xpath("//div[@id='aliasBefore']/strong/text()").extract())
    item["usermail"] = "".join(
        hxs.xpath(
            u"//span[contains(text(),'邮箱:')]/following-sibling::div[1]/div/strong/text()"
        ).re(r'\S+'))
    item["userrealname"] = "".join(
        hxs.xpath("//input[@id='realName']/@value").extract())
    item["usertype"] = "".join(
        hxs.xpath(
            u"translate(//div[@class='info-m']/div[contains(text(),'会员类型:')]/text(),'会员类型:','')"
        ).extract())
    safetycenter_url = "https://safe.jd.com/user/paymentpassword/safetyCenter.action"
    yield Request(url=safetycenter_url,
                  callback=self.parse_safetycenter,
                  meta={"item": item})
def _parse_jd(self, response):
    r = response.request
    hxs = HtmlXPathSelector(response)
    items = hxs.select("//li[@class='item']/div[@class='store']")
    if items:
        yield self._jd_request(r.meta['level'], r.meta['id'], r.meta['name'], r.meta['page'] + 1)
    for item in items:
        name = item.select('.//h3/a/text()').extract()[0]
        rating = item.select(".//div[@class='rank']/span/@class").extract()[0]
        tags = item.select(".//div[@class='tag']/a/text()").extract()
        path = item.select('.//h3/a/@href').extract()[0]
        place = MafengwoPlace()
        place['name'] = name
        if rating:
            place['rating'] = int(rating[4:])
        place['tags'] = tags
        place['id'] = self._extract_id(path)
        place['p_id'] = r.meta['id']
        place['level'] = self._sub_level(r.meta['level'])
        yield self._jd_detail_request(place, path)
def parse_detail(self, response):
    print "--------------------------"
    self.logger.info("--------------------------")
    hxs = HtmlXPathSelector(response)
    items = []
    price_url_pre = "http://p.3.cn/prices/mgets?skuIds=J_"
    for gl_item in hxs.xpath("//*[@id='plist']/ul/li[@class='gl-item']"):
        # self.logger.info("GGGGGGGGGGGGGGGGGGGGGGGGG: %s" % gl_item.extract())
        book_element = gl_item.xpath("div[@class='tab-content-item j-sku-item tab-cnt-i-selected']")
        if book_element is None or len(book_element) == 0:
            book_element = gl_item.xpath("div")
        data_sku_id = self.get_xpath_val(book_element, "@data-sku")
        price_url = price_url_pre + data_sku_id
        item = JdbookItem()
        item["name"] = self.get_xpath_val(book_element, "div[3]/a/em/text()")
        item["publisher"] = self.get_xpath_val(book_element, "div[4]/span[2]/a/text()")
        item["author"] = self.get_xpath_val(book_element, "div[4]/span[1]/span[1]/a[1]/text()")
        item["commit"] = self.get_xpath_val(book_element, "div[6]/strong/a/text()")
        item["shop"] = self.get_xpath_val(book_element, "div[7]/span/text()")
        r = Request(price_url, callback=self.parse_price, dont_filter=True, meta={"item": item})
        items.append(item)
        yield r
def sendphonecode(self, response):
    hxs = HtmlXPathSelector(response)
    sendphonecodekey = "".join(
        hxs.xpath(
            "translate(//*[@id='sendMobileCode']/@href,'javascript:sendFindPwdCode(|);','')"
        ).extract()).replace("'", "")
    if self.vercode is None:
        item = JdItem()
        logging.warning(
            msg="Login to JD needs a phone verification code; sending the code to the user's phone")
        item["status"] = 2
        jobid = self.settings.get("jobid", None)
        item["jobid"] = jobid
        #self.con.lpush(jobid, item)
        self.items.append(item)
        yield Request(url=self.sendcodeurl % sendphonecodekey, dont_filter=True)
    else:
        yield Request(url=self.validatecodeurl % (self.vercode, sendphonecodekey),
                      callback=self.checkphonekey)
def parse_activity(self, response):
    hxs = HtmlXPathSelector(response)
    event_info_node = hxs.select('//div[@class="event-info"]')
    title = self.text(event_info_node.select('h1'))
    event_detail_nodes = event_info_node.select('div[@class="event-detail"]')
    details = {}
    for n in event_detail_nodes:
        self.log(n.extract())
        key = self.text(n.select('span')).strip().replace(':', '')
        value = re.sub('<[^<]+?>', '', n.extract()).split(':')[1].strip()
        details[key] = value
    description = '\n'.join(hxs.select('//div[@class="related_info"]/div/div/text()').extract())
    photo_urls = hxs.select('//ul[contains(@class,"event-detail-photo")]/li/a/img/@src').extract()
    photo_urls = map(lambda x: x.replace('albumicon', 'photo'), photo_urls)
    entry = Activity()
    entry['id'] = response.url
    entry['title'] = title
    entry['description'] = description
    entry['images'] = photo_urls
    entry['details'] = details
    return entry
def parse_next_page(self, response):
    print ("Fetch group home page: %s" % response.url)
    hxs = HtmlXPathSelector(response)
    pages_num = hxs.select("//*[@id='J_bottomPage']/span[2]/em[1]/b/text()").extract()[0]
    print pages_num
    page_url = hxs.select("//*[@id='J_bottomPage']/span[1]/a[2]/@href").extract()[0]
    print page_url
    PAGE_PATTERN = r"(.+page=)[\d]+(.+)"
    parse = re.compile(PAGE_PATTERN, re.UNICODE | re.VERBOSE)
    match = parse.search(page_url)
    if match:
        pre_url = match.group(1)
        post_url = match.group(2)
        for i in xrange(1, int(pages_num) + 1):  # iterate each page
            page_url = "http://list.jd.com" + pre_url + str(i) + post_url
            r = Request(page_url, callback=self.parse_detail)
            yield r
    else:
        print "NONONONO!"
def parse_baseinfo(self, response):
    item = response.meta.get("item")
    coin_name = response.meta.get("coin_name")
    hxs = HtmlXPathSelector(response)
    item['liquidity'] = hxs.xpath(
        '//div[@class="score"][contains(text(), "Liquidity")]/span/text()'
    ).extract_first()
    item["hash_algorithm"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Hashing Algorithm")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["hash_rate"] = hxs.xpath(
        '//div[@class="hashrate"]/p/text()').extract_first()
    item["block_time"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Block Time")]/following-sibling::p[1]/text()'
    ).extract_first()
    item["homepage"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Homepage")]/following-sibling::p[1]/a/text()'
    ).extract_first()
    item["block_chain_supply"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Blockchain/Supply")]/following-sibling::p[1]/a/text()'
    ).extract_first()
    item["discussion_forum"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Discussion Forum")]/following-sibling::p[1]/a/text()'
    ).extract_first()
    item["available_total_supply"] = hxs.xpath(
        '//div[@class="tab-title"][contains(text(), "Available/Total Supply")]/following-sibling::p[1]/text()'
    ).extract_first()
    url = "https://www.coingecko.com/en/coins/{}/social#panel".format(coin_name)
    yield Request(url=url,
                  callback=self.parse_community,
                  meta={"item": item, "coin_name": coin_name},
                  dont_filter=True)
def parse_starts(self, response):
    jobid = response.meta["jobid"]
    html = HtmlXPathSelector(response)
    meta = response.meta
    meta["start_5"] = ''.join(
        html.xpath(u"//a[text()='5星']/following-sibling::em/text()").extract()
    ).replace(")", "").replace("(", "")
    meta["start_4"] = ''.join(
        html.xpath(u"//a[text()='4星']/following-sibling::em/text()").extract()
    ).replace(")", "").replace("(", "")
    meta["start_3"] = ''.join(
        html.xpath(u"//a[text()='3星']/following-sibling::em/text()").extract()
    ).replace(")", "").replace("(", "")
    meta["start_2"] = ''.join(
        html.xpath(u"//a[text()='2星']/following-sibling::em/text()").extract()
    ).replace(")", "").replace("(", "")
    meta["start_1"] = ''.join(
        html.xpath(u"//a[text()='1星']/following-sibling::em/text()").extract()
    ).replace(")", "").replace("(", "")
    next_page = ''.join(html.xpath(u"//a[text()='下一页']/@href").extract())
    item = {}
    item["business_details"] = meta
    self.items.append(item)
    yield Request(urljoin(meta["review_urls"], "?pageno=1"),
                  callback=self.parse_item,
                  meta=meta)
    del response
def parse_ballancecount(self, response):
    item = JdItem()
    hxs = HtmlXPathSelector(response)
    item["spidertime"] = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
    item["username"] = self.username
    item["passwd"] = self.passwd
    item["usernickname"] = "".join(
        hxs.xpath("//div[@class='u-name']/a/text()").extract())
    item["userrank"] = "".join(
        hxs.xpath("//div[@class='u-level']/span/a/text()").extract())
    item["balance"] = "".join(
        hxs.xpath("//a[@id='BalanceCount']/text()").extract())
    item["baitiaobalance"] = "".join(
        hxs.xpath("//span[@class='baitiao-limit']/text()").extract())
    item["wallet"] = "".join(
        hxs.xpath("//div[@id='balance']/a[2]/em/text()").extract())
    item["yesprofit"] = "".join(
        hxs.xpath("//div[@class='ftx01 profit']/a/text()").extract())
    userinfo_url = 'https://i.jd.com/user/info'
    yield Request(url=userinfo_url, callback=self.parse_userinfo, meta={"item": item})
def parse_xiangxi(self, response):
    html = HtmlXPathSelector(response)
    meta = response.meta
    meta["title"] = ''.join(
        html.xpath("//*[@id='basic-info']/h1/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["start"] = ''.join(
        html.xpath("//*[@id='basic-info']/div[1]/span[1]/@title").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["mean_price"] = ''.join(
        html.xpath("//*[@id='basic-info']/div[1]/span[3]/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["address"] = ''.join(
        html.xpath("//*[@id='basic-info']/div[2]/span[2]/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["taste"] = ''.join(
        html.xpath("//*[@id='basic-info']/div[1]/span[4]/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["environment"] = ''.join(
        html.xpath("//*[@id='basic-info']/div[1]/span[5]/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["service"] = ''.join(
        html.xpath("//*[@id='basic-info']/div[1]/span[6]/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["tel"] = ''.join(
        html.xpath("//*[@id='basic-info']/p[1]/span[2]/text()").extract()
    ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    meta["review_num"] = ''.join(
        html.xpath("//*[@id='comment']/h2/a[2]/span/text()").extract()
    ).replace(")", "").replace("(", "").replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
    # more_review = ''.join(html.xpath("//*[@id='comment']/p/a/@href").extract())
    more_review = ''.join(
        html.xpath(u"//a[contains(text(),'更多点评')]/@href").extract())
    review_url = meta["url"] + "/review_more#start=10"
    meta["review_urls"] = meta["url"] + "/review_more"
    yield Request(review_url, callback=self.parse_starts, meta=meta)
    del response
def parse_fund(self, response): x = HtmlXPathSelector(response) fund = HlscraperItem() fund['Url'] = response.url fund['Name'] = x.select("normalize-space(/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/text())").extract() fund['ExdividendDate'] = x.select("normalize-space(//tr/th[text()[contains(., 'Ex-dividend date')]]/../td/text())").extract() fund['PaymentDate'] = x.select("normalize-space(//tr/th[text()[contains(., 'Payment date')]]/../td/text())").extract() fund['RunningYield'] = x.select("normalize-space(//tr/th[text()[contains(., 'Running yield')]]/../td/text())").extract() fund['HistoricYield'] = x.select("normalize-space(//tr/th[text()[contains(., 'Historic yield')]]/../td/text())").extract() fund['IncomePaid'] = x.select("normalize-space(//tr/th[text()[contains(., 'Income paid')]]/../td/text())").extract() fund['TypeOfPayment'] = x.select("normalize-space(//tr/th[text()[contains(., 'Type of payment')]]/../td/text())").extract() fund['LaunchDate'] = x.select("normalize-space(//tr/th[text()[contains(., 'Launch date')]]/../td/text())").extract() fund['Sector'] = x.select("normalize-space(//tr/th[text()[contains(., 'Sector')]]/../td/text())").extract() fund['FundSize'] = x.select("normalize-space(//tr/th[text()[contains(., 'Fund size')]]/../td/text())").extract() fund['NumberOfHoldings'] = x.select("normalize-space(//tr/th[text()[contains(., 'Number of holdings')]]/../td/text())").extract() fund['TypeOfUnits'] = x.select("normalize-space(//tr/th[text()[contains(., 'Type of units')]]/../td/text())").extract() fund['FundType'] = x.select("normalize-space(//tr/th[text()[contains(., 'Fund type')]]/../td/text())").extract() fund['NetInitialCharge'] = x.select("normalize-space(//tr/th[text()[contains(., 'Net initial charge')]]/../td/text())").extract() fund['NetAnnualCharge'] = x.select("normalize-space(//tr/th[text()[contains(., 'Net Annual charge')]]/../td/text())").extract() fund['OtherExpenses'] = x.select("normalize-space(//tr/th[text()[contains(., \"Fund manager's other expenses\")]]/../td/text())").extract() fund['PerformanceFee'] = x.select("normalize-space(//tr/th[text()[contains(., 'Performance fee')]]/../td/text())").extract() fund['PlatformFee'] = x.select("normalize-space(//tr/th[text()[contains(., 'HL Platform charge')]]/../td/text())").extract() fund['Wealth150'] = x.select("/html/body/div[@id='container']/div[@id='content']/div[@class='spacer-left-dbl']/div[@id='fund-section-content']/div[@class='spacer-bottom']/div[@id='security-title']/h1[@class='underline']/a/img/@src").extract() return fund
def parse_item(self, response):
    jobid = response.meta["jobid"]
    html = HtmlXPathSelector(response)
    meta = response.meta
    item = {}
    _item = []
    for comment in html.xpath("//*[@class='comment-list']/ul/li"):
        comments = {}
        comments["start"] = ''.join(
            comment.xpath(".//*[@class='user-info']/span/@title").extract()
        ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["taste"] = ''.join(
            comment.xpath(u".//*[contains(text(),'口味')]/em/text()").extract()
        ).replace(")", "").replace("(", "").replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["environment"] = ''.join(
            comment.xpath(u".//*[contains(text(),'环境')]/em/text()").extract()
        ).replace(")", "").replace("(", "").replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["service"] = ''.join(
            comment.xpath(u".//*[contains(text(),'服务')]/em/text()").extract()
        ).replace(")", "").replace("(", "").replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["review_text"] = ''.join(
            comment.xpath(".//*[@class='comment-txt']/div/text()").extract()
        ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["review_time"] = ''.join(
            comment.xpath(".//*[@class='time']/text()").extract()
        ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["discussant"] = ''.join(
            comment.xpath(".//*[@class='name']/a/text()").extract()
        ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        comments["discussant_contribution"] = ''.join(
            comment.xpath(".//*[@class='contribution']/span/@title").extract()
        ).replace("\r", "").replace("\n", "").replace("\t", "").replace(" ", "")
        _item.append(comments)
    item["review_details"] = _item
    item["review_page"] = ''.join(
        html.xpath("//span[@class='PageSel']/text()").extract())
    self.items.append(item)
    next_page = ''.join(html.xpath(u"//a[text()='下一页']/@href").extract())
    if next_page:
        yield Request(urljoin(meta["review_urls"], next_page),
                      callback=self.parse_item,
                      meta=meta)
    del response