def parse_item(self, response):
    """Scrape one car-listing page: write a CSV row per listing, follow pagination."""
    sel = HtmlXPathSelector(response)
    meta = response.meta
    # City label from the page header, whitespace stripped by re(\S+).
    city = "".join(sel.xpath('//a[@class="choose-city"]/span/text()').re(r'\S+'))
    for listing in sel.xpath("//div[@class='list-infoBox']"):
        def text_of(query, listing=listing):
            return "".join(listing.xpath(query).extract())
        row = [
            urljoin(self.start_urls[0], text_of('.//a[1]/@href')),
            text_of('.//a[1]/@title'),
            text_of(".//i[@class='fc-org priType']/text()"),
            text_of('.//p[@class="priType-s"]/s/text()'),
            text_of('.//p[@class="fc-gray"]/descendant::text()'),
            city,
            meta['brand_name'],
        ]
        # `writer` is a module-level csv writer; cells are utf-8 encoded with
        # all embedded whitespace removed.
        writer.writerow([
            cell.encode("utf8").replace("\n", "").replace("\t", "")
                .replace("\r", "").replace(" ", "")
            for cell in row
        ])
    next_page = sel.xpath('//a[@class="next"]/@href').extract()
    if next_page:
        yield Request(urljoin(self.start_urls[0], next_page[0]),
                      callback=self.parse_item, meta=meta)
def parse_userinfo(self, response):
    """Fill JD account-profile fields on the item, then open the safety center."""
    sel = HtmlXPathSelector(response)
    item = response.meta["item"]
    item["userloginname"] = "".join(
        sel.xpath("//div[@id='aliasBefore']/strong/text()").extract())
    # Mail address sits next to the mailbox label; re(\S+) drops whitespace.
    item["usermail"] = "".join(
        sel.xpath(
            u"//span[contains(text(),'邮箱:')]/following-sibling::div[1]/div/strong/text()"
        ).re(r'\S+'))
    item["userrealname"] = "".join(
        sel.xpath("//input[@id='realName']/@value").extract())
    # XPath translate() strips the label characters from the text node.
    item["usertype"] = "".join(
        sel.xpath(
            u"translate(//div[@class='info-m']/div[contains(text(),'会员类型:')]/text(),'会员类型:','')"
        ).extract())
    yield Request(
        url="https://safe.jd.com/user/paymentpassword/safetyCenter.action",
        callback=self.parse_safetycenter,
        meta={"item": item})
def parse(self, response):
    """Debug hook: dump the auto-load script contents to stdout."""
    print("*" * 66)
    print(HtmlXPathSelector(response)
          .xpath("//script[@class='J_auto-load']/text()").extract())
    print("-" * 66)
    return
def parse_list(self, response):
    """Follow every brand link on the listing page into parse_item."""
    sel = HtmlXPathSelector(response)
    brand_names = sel.xpath(
        '//span[contains(@class,"brand-all")]/descendant::a/text()').re(r'\S+')
    brand_urls = sel.xpath(
        '//span[contains(@class,"brand-all")]/descendant::a/@href').extract()
    # zip truncates to the shorter list, pairing each name with its href.
    for brand_name, href in zip(brand_names, brand_urls):
        yield Request(urljoin(self.start_urls[0], href),
                      callback=self.parse_item,
                      meta={'brand_name': brand_name})
def parse(self, response):
    """Follow every city link on the city-selection page into parse_list."""
    sel = HtmlXPathSelector(response)
    anchor = '//div[@class="all-city"]/descendant::a[contains(@data-gzlog,"select_city")]'
    city_names = sel.xpath(anchor + '/text()').re(r'\S+')
    city_urls = sel.xpath(anchor + '/@href').extract()
    # Names are only extracted to keep the two lists aligned; the request
    # itself needs just the href.
    for _city, href in zip(city_names, city_urls):
        yield Request(urljoin(self.start_urls[0], href),
                      callback=self.parse_list)
def parse_developer(self, response):
    """Scrape repository statistics from the coin's developer panel and emit the item."""
    item = response.meta.get("item")
    sel = HtmlXPathSelector(response)
    # Every stat sits in the first <p> right after its captioned tab-title div;
    # only the caption text differs between fields.
    captioned_stats = (
        ("stars", "Stars"),
        ("watchers", "Watchers"),
        ("forks", "Forks"),
        ("merged_pull_requests", "Merged Pull Requests"),
        ("total_issues", "Total Issues"),
        ("closed_issues", "Closed Issues"),
        ("contributors", "Contributors"),
        ("total_new_commits", "Total new commits"),
    )
    for field, caption in captioned_stats:
        item[field] = sel.xpath(
            '//div[@class="tab-title"][contains(text(), "%s")]'
            '/following-sibling::p[1]/text()' % caption).extract_first()
    yield item
    self.item_counts += 1
    self.logger.info("current item counts <{}>".format(self.item_counts))
def parse_safetycenter(self, response):
    """Grab phone and real-name info, store the item, then fetch the order list."""
    sel = HtmlXPathSelector(response)
    item = response.meta["item"]
    item["userphone"] = "".join(
        sel.xpath("//strong[@id='mobile']/text()").re(r'\S+'))
    item["useridcard"] = "".join(
        sel.xpath(
            u"//span[contains(text(),'您认证的实名信息:')]/following::strong[2]/text()"
        ).extract())
    self.items.append(item)
    yield Request(url="https://order.jd.com/center/list.action",
                  callback=self.parse_order_year)
def parse_renqizhishu(self, response):
    """Store the popularity-ranking text into redis under the current job id."""
    sel = HtmlXPathSelector(response)
    ranking = ''.join(
        sel.xpath(u"//*[contains(text(),'第')]/text()").extract())
    self.con.hmset(self.jobid, {"popularity_ranking": ranking})
    del response
def parse_order_list(self, response):
    """Walk one page of the JD order list: open each order detail page,
    then follow pagination back into this parser.

    NOTE(review): the cookie parsing below assumes the request carries at
    least one 'Cookie' header and that every ';'-separated piece contains
    '=' — confirm, otherwise IndexError/ValueError at runtime.
    """
    hxs = HtmlXPathSelector(response)
    orders_urls = hxs.xpath("//a[@name='orderIdLinks']/@href").extract()
    # `headers` is only consumed by the commented-out queue push below;
    # the yielded requests do not use it.
    headers = dict(response.request.headers)
    headers.update({"Referer": None})
    # Rebuild the session cookies as a plain dict from the raw Cookie header.
    sess = {}
    cookie = response.request.headers.getlist('Cookie')[0].split(";")
    for cook in cookie:
        sess.update({
            cook[:cook.index("=")]: cook[cook.index("=") + 1:].replace('"', "")
        })
    for order_url in orders_urls:
        # Only follow links that look like order-detail pages.
        if "orderId=" in order_url or "orderid" in order_url:
            #self.queues.push(Request(url=urljoin(self.start_urls[0],order_url),meta={"jobid":self.jobid},headers=headers))
            yield Request(url=urljoin(self.start_urls[0], order_url),
                          cookies=sess,
                          meta={"jobid": self.jobid},
                          callback=self.parse_items)
    # Follow every 'next' link (normally at most one).
    next_page_url = hxs.xpath("//a[@class='next']/@href").extract()
    if next_page_url:
        for next_url in next_page_url:
            yield Request(url=urljoin(self.start_urls[0], next_url),
                          callback=self.parse_order_list)
def parse_community(self, response):
    """Scrape social/community stats for a coin, then chain to the developer panel."""
    item = response.meta.get("item")
    coin_name = response.meta.get("coin_name")
    sel = HtmlXPathSelector(response)
    # The three headline counters share one xpath shape, keyed by link caption.
    for field, caption in (("subscribers", "Subscribers"),
                           ("followers", "Followers"),
                           ("likes", "Likes")):
        item[field] = sel.xpath(
            '//a[@rel="nofollow"][contains(text(),"%s")]'
            '/../following-sibling::p[1]/text()' % caption).extract_first()
    item["avg_users_online"] = sel.xpath(
        '//div[contains(@class, "social-media")][contains(text(), "Online")]/p[1]/text()'
    ).extract_first()
    item["avg_new_hot_posts_per_hour"] = sel.xpath(
        '//div[contains(@class, "social-media")][contains(text(), "New Hot")]/p[1]/text()'
    ).extract_first()
    item["avg_new_comments_on_hot_posts_per_hour"] = sel.xpath(
        '//div[contains(@class, "col-md")][contains(text(), "Comments")]/p[1]/text()'
    ).extract_first()
    yield Request(
        url="https://www.coingecko.com/en/coins/{}/developer#panel".format(coin_name),
        callback=self.parse_developer,
        meta={"item": item},
        dont_filter=True)
def parse(self, response):
    """Parse the coin overview table, request each coin's detail panel, paginate.

    Fixes:
    - rows whose chart link yields no coin slug are skipped instead of
      raising IndexError on ``coin_name[0]``;
    - a missing img ``@data-src`` no longer raises TypeError on
      ``"https:" + None``;
    - the next-page link is only followed when present —
      ``response.follow(None)`` raises ValueError on the last page.
    """
    hxs = HtmlXPathSelector(response)
    for data in hxs.xpath("//*[@id='gecko-table']/tbody/tr"):
        # The coin slug comes from the price-chart link; without it we
        # cannot build any detail URL, so skip the row.
        coin_name = data.xpath(
            './/td[@class="coin-name"]//a[@class="currency_exchangable_chart_link"]/@href'
        ).re(r'price_charts/([\S\s]+?)/usd')
        if not coin_name:
            continue
        item = IcoItem()
        item["name"] = data.xpath(
            './/td[@class="coin-name"]//span[@class="coin-content-name"]/text()'
        ).extract_first()
        item["symbol"] = data.xpath(
            './/td[@class="coin-name"]//span[@class="coin-content-symbol"]/text()'
        ).extract_first()
        item["img_url"] = "https:" + (data.xpath(
            './/td[@class="coin-name"]//img/@data-src').extract_first() or "")
        item["other"] = data.xpath(
            './/td[@class="coin-name"]//small/text()').extract_first()
        item["developer"] = data.xpath(
            './/td[@class="td-developer_score dev"]/div[1]/text()'
        ).extract_first()
        item["community"] = data.xpath(
            './/td[@class="td-community_score community"]/div[1]/text()'
        ).extract_first()
        item["public_interest"] = data.xpath(
            './/td[@class="td-public_interest_score pb-interest"]/div[1]/text()'
        ).extract_first()
        item["total"] = data.xpath(
            './/td[@class="total"]/div[1]/text()').extract_first()
        url = "https://www.coingecko.com/en/coins/{}#panel".format(coin_name[0])
        yield Request(url=url,
                      callback=self.parse_baseinfo,
                      meta={"item": item, "coin_name": coin_name[0]},
                      dont_filter=True)
    next_page_url = hxs.xpath('//link[@rel="next"]/@href').extract_first()
    self.logger.info("current page url <{}>".format(next_page_url))
    if next_page_url:
        yield response.follow(next_page_url, self.parse)
def parse_order_year(self, response):
    """Request the order lists for the first two extra year tabs."""
    sel = HtmlXPathSelector(response)
    list_url = "https://order.jd.com/center/list.action?search=0&d="
    # @_val on each year tab (skipping the first tab) holds the query value;
    # only the first two tabs are followed.
    year_values = sel.xpath(
        "//div[@class='time-list']/ul/li[position()>1]/a/@_val").extract()[0:2]
    for value in year_values:
        yield Request(url=list_url + value, callback=self.parse_order_list)
def parse_district_num(self, response):
    """Store the district hit-count (digits inside the '(N)' span) into redis."""
    sel = HtmlXPathSelector(response)
    raw_count = ''.join(sel.xpath("//span[@class='num']/text()").extract())
    self.con.hmset(
        self.jobid,
        {"district_num": raw_count.replace("(", "").replace(")", "")})
    del response
def parse_baseinfo(self, response):
    """Scrape a coin's base facts, then chain to its social panel."""
    item = response.meta.get("item")
    coin_name = response.meta.get("coin_name")
    sel = HtmlXPathSelector(response)
    item['liquidity'] = sel.xpath(
        '//div[@class="score"][contains(text(), "Liquidity")]/span/text()'
    ).extract_first()
    item["hash_rate"] = sel.xpath(
        '//div[@class="hashrate"]/p/text()').extract_first()
    # Captioned rows share one shape: the value is the first <p> after the
    # caption div — plain text for some fields, inside an <a> for others.
    row = ('//div[@class="tab-title"][contains(text(), "%s")]'
           '/following-sibling::p[1]%s/text()')
    for field, caption in (("hash_algorithm", "Hashing Algorithm"),
                           ("block_time", "Block Time"),
                           ("available_total_supply", "Available/Total Supply")):
        item[field] = sel.xpath(row % (caption, '')).extract_first()
    for field, caption in (("homepage", "Homepage"),
                           ("block_chain_supply", "Blockchain/Supply"),
                           ("discussion_forum", "Discussion Forum")):
        item[field] = sel.xpath(row % (caption, '/a')).extract_first()
    yield Request(
        url="https://www.coingecko.com/en/coins/{}/social#panel".format(coin_name),
        callback=self.parse_community,
        meta={"item": item, "coin_name": coin_name},
        dont_filter=True)
def parse_detail(self, response): print "--------------------------" self.logger.info("--------------------------") hxs = HtmlXPathSelector(response) items = [] price_url_pre = "http://p.3.cn/prices/mgets?skuIds=J_" for gl_item in hxs.xpath("//*[@id='plist']/ul/li[@class='gl-item']"): # self.logger.info("GGGGGGGGGGGGGGGGGGGGGGGGG: %s" % gl_item.extract()) book_element = gl_item.xpath("div[@class='tab-content-item j-sku-item tab-cnt-i-selected']") if book_element is None or len(book_element) == 0: book_element = gl_item.xpath("div") data_sku_id = self.get_xpath_val(book_element, "@data-sku") price_url = price_url_pre + data_sku_id item = JdbookItem() item["name"] = self.get_xpath_val(book_element, "div[3]/a/em/text()") item["publisher"] = self.get_xpath_val(book_element, "div[4]/span[2]/a/text()") item["author"] = self.get_xpath_val(book_element, "div[4]/span[1]/span[1]/a[1]/text()") item["commit"] = self.get_xpath_val(book_element, "div[6]/strong/a/text()") item["shop"] = self.get_xpath_val(book_element, "div[7]/span/text()") r = Request(price_url, callback=self.parse_price, dont_filter=True, meta={"item": item}) items.append(item) yield r
def sendphonecode(self, response):
    """Trigger (or validate) the SMS verification code during JD login.

    The page's sendMobileCode link embeds a one-time key inside a
    ``javascript:sendFindPwdCode('...');`` href; XPath translate() strips
    the surrounding characters and the remaining quotes are removed here.
    """
    hxs = HtmlXPathSelector(response)
    sendphonecodekey = "".join(
        hxs.xpath(
            "translate(//*[@id='sendMobileCode']/@href,'javascript:sendFindPwdCode(|);','')"
        ).extract()).replace("'", "")
    if self.vercode is None:
        # No code supplied by the caller yet: record status 2 on the job
        # (phone code requested) and ask JD to SMS the code to the user.
        item = JdItem()
        logging.warning(
            msg=
            "Login jd need phone vercode, will send phone code to user phone "
        )
        item["status"] = 2
        jobid = self.settings.get("jobid", None)
        item["jobid"] = jobid
        #self.con.lpush(jobid, item)
        self.items.append(item)
        yield Request(url=self.sendcodeurl % sendphonecodekey,
                      dont_filter=True)
    else:
        # A code was provided: submit it together with the page key.
        yield Request(url=self.validatecodeurl % (self.vercode,
                                                  sendphonecodekey),
                      callback=self.checkphonekey)
def parse_starts(self, response):
    """Collect the 5..1 star counts into meta, then open page 1 of the reviews.

    Fixes:
    - the five copy-pasted per-star extractions (identical except for the
      digit) are now one loop;
    - removed the dead ``next_page`` lookup (computed but never used here;
      pagination is handled by parse_item).

    NOTE(review): the 4-star key keeps the original's misspelled "strat_4"
    because stored items may already be consumed under that name; rename it
    together with all downstream readers.
    """
    jobid = response.meta["jobid"]
    html = HtmlXPathSelector(response)
    meta = response.meta
    # Counts are shown as '(N)' next to each star filter link.
    for key, star in zip(("start_5", "strat_4", "start_3", "start_2", "start_1"),
                         u"54321"):
        meta[key] = ''.join(
            html.xpath(u"//a[text()='%s星']/following-sibling::em/text()" % star)
            .extract()).replace(")", "").replace("(", "")
    item = {}
    item["business_details"] = meta
    self.items.append(item)
    yield Request(urljoin(meta["review_urls"], "?pageno=1"),
                  callback=self.parse_item,
                  meta=meta)
    del response
def parse_ballancecount(self, response):
    """Build the account item with balance figures, then fetch the profile page."""
    sel = HtmlXPathSelector(response)
    item = JdItem()
    item["spidertime"] = time.strftime('%Y%m%d%H%M%S',
                                       time.localtime(time.time()))
    item["username"] = self.username
    item["passwd"] = self.passwd
    # Each balance-related field is the joined text of a single xpath hit.
    scraped_fields = (
        ("usernickname", "//div[@class='u-name']/a/text()"),
        ("userrank", "//div[@class='u-level']/span/a/text()"),
        ("balance", "//a[@id='BalanceCount']/text()"),
        ("baitiaobalance", "//span[@class='baitiao-limit']/text()"),
        ("wallet", "//div[@id='balance']/a[2]/em/text()"),
        ("yesprofit", "//div[@class='ftx01 profit']/a/text()"),
    )
    for field, query in scraped_fields:
        item[field] = "".join(sel.xpath(query).extract())
    yield Request(url='https://i.jd.com/user/info',
                  callback=self.parse_userinfo,
                  meta={"item": item})
def parse_item(self, response):
    """Parse one page of dianping reviews into self.items, then paginate."""
    jobid = response.meta["jobid"]
    sel = HtmlXPathSelector(response)
    meta = response.meta

    def squeeze(text):
        # Remove every whitespace character the page litters values with.
        for ch in ("\r", "\n", "\t", " "):
            text = text.replace(ch, "")
        return text

    def rating(node, label):
        # Sub-score such as '口味(4)': strip parentheses, then whitespace.
        raw = ''.join(
            node.xpath(u".//*[contains(text(),'%s')]/em/text()" % label)
            .extract())
        return squeeze(raw.replace(")", "").replace("(", ""))

    reviews = []
    for comment in sel.xpath("//*[@class='comment-list']/ul/li"):
        reviews.append({
            "start": squeeze(''.join(
                comment.xpath(".//*[@class='user-info']/span/@title").extract())),
            "taste": rating(comment, u'口味'),
            "environment": rating(comment, u'环境'),
            "service": rating(comment, u'服务'),
            "review_text": squeeze(''.join(
                comment.xpath(".//*[@class='comment-txt']/div/text()").extract())),
            "review_time": squeeze(''.join(
                comment.xpath(".//*[@class='time']/text()").extract())),
            "discussant": squeeze(''.join(
                comment.xpath(".//*[@class='name']/a/text()").extract())),
            "discussant_contribution": squeeze(''.join(
                comment.xpath(".//*[@class='contribution']/span/@title").extract())),
        })
    self.items.append({
        "review_details": reviews,
        "review_page": ''.join(
            sel.xpath("//span[@class='PageSel']/text()").extract()),
    })
    next_page = ''.join(sel.xpath(u"//a[text()='下一页']/@href").extract())
    if next_page:
        yield Request(urljoin(meta["review_urls"], next_page),
                      callback=self.parse_item,
                      meta=meta)
    del response
def parse_xiangxi(self, response):
    """Scrape the shop detail header into meta, then open the review pages."""
    sel = HtmlXPathSelector(response)
    meta = response.meta

    def grab(query, strip_parens=False):
        # Join the xpath hits, optionally strip parentheses, always strip
        # every embedded whitespace character.
        text = ''.join(sel.xpath(query).extract())
        if strip_parens:
            text = text.replace(")", "").replace("(", "")
        for ch in ("\r", "\n", "\t", " "):
            text = text.replace(ch, "")
        return text

    meta["title"] = grab("//*[@id='basic-info']/h1/text()")
    meta["start"] = grab("//*[@id='basic-info']/div[1]/span[1]/@title")
    meta["mean_price"] = grab("//*[@id='basic-info']/div[1]/span[3]/text()")
    meta["address"] = grab("//*[@id='basic-info']/div[2]/span[2]/text()")
    meta["taste"] = grab("//*[@id='basic-info']/div[1]/span[4]/text()")
    meta["environment"] = grab("//*[@id='basic-info']/div[1]/span[5]/text()")
    meta["service"] = grab("//*[@id='basic-info']/div[1]/span[6]/text()")
    meta["tel"] = grab("//*[@id='basic-info']/p[1]/span[2]/text()")
    meta["review_num"] = grab("//*[@id='comment']/h2/a[2]/span/text()",
                              strip_parens=True)
    # more_review = ''.join(html.xpath("//*[@id='comment']/p/a/@href").extract())
    more_review = ''.join(
        sel.xpath(u"//a[contains(text(),'更多点评')]/@href").extract())
    review_url = meta["url"] + "/review_more#start=10"
    meta["review_urls"] = meta["url"] + "/review_more"
    yield Request(review_url, callback=self.parse_starts, meta=meta)
    del response
def parse_get_xinxi(self, response):
    """Parse the Hunan housing-fund account-info page and store it to HBase.

    Fixes:
    - the no-data check used to be ``if not item:`` — by that point the dict
      always holds keys, so the 2199 "no data" branch was unreachable; it now
      fires when every scraped value is empty;
    - the label-relative table lookups (identical except for label text and
      column index) are deduplicated into two loops.
    """
    logger.info(msg="<%s>,get HUNANGJJ XX!!!" % self.idcard)
    html = HtmlXPathSelector(response)
    item = {}

    def cell(query):
        return ''.join(html.xpath(query).extract()).strip()

    # Value sits in the Nth <td> of the row holding the label <td>.
    for field, label, col in (
            ("realName", u"职工姓名", 2),
            ("idCard", u"身份证号", 2),
            ("comName", u"所在单位", 2),
            ("officeName", u"所属办事处", 4),
            ("startDate", u"开户日期", 2),
            ("fundStatus", u"当前状态", 4),
            ("monthPayBase", u"月缴基数", 2),
            ("fundRatio", u"缴存比例", 5),
            ("monthPayRmb", u"月缴金额", 2),
            ("comRmb", u"单位月缴额", 2),
            ("perRmb", u"个人月缴额", 2),
            ("yearPayRmb", u"本年缴交", 2),
            ("yearSwitchInRmb", u"本年转入", 2),
            ("endDate", u"缴至年月", 2)):
        item[field] = cell(u"//td[text()='%s']/../td[%d]//text()" % (label, col))
    # These labels sit one element deeper; the value is the row's 4th <td>.
    for field, label in (
            ("lastYearBalanceRmb", u"上年余额"),
            ("yearRepayRmb", u"本年补缴"),
            ("yearDrawRmb", u"本年支取"),
            ("yearAccrual", u"本年利息"),
            ("balance", u"公积金余额")):
        item[field] = cell(u"//*[text()='%s']/../../td[4]//text()" % label)
    if item["idCard"]:
        # Mainland ID card: digits 7-14 are the birth date; the 17th digit's
        # parity gives the sex (odd = male, even = female).
        item["birthday"] = item["idCard"][6:14]
        sex = item["idCard"][-2]
        if int(sex) % 2 == 1:
            item["sex"] = u"男"
        else:
            item["sex"] = u"女"
    else:
        item["sex"] = ""
        item["birthday"] = ""
    logger.info(msg="<%s>,get HUNANGJJ XX OVER!!!" % self.idcard)
    if not any(item.values()):
        # Nothing scraped at all: flag fetch failure 2199 in redis.
        self.sign = 0
        self.item_status["fatch_code"] = 2199
        self.con.hmset(self.key, dict(self.item_status))
        logger.info(msg="<%s>,get HUNANGJJ XX NO DATA!!!" % self.idcard)
    else:
        self.tb = HbClient()
        self.tb.insert(colname=u"公积金-湖南-信息查询 ",
                       url=response.url,
                       html=response.body,
                       struct_dic={"basic_info": [item]},
                       id=self.idcard,
                       post_dic=self.form_data,
                       token=self.token)
        logger.info(msg="<%s>,get HUNANGJJ XX OVER!!!" % self.idcard)
def parse(self, response):
    """Search-result page: find the target merchant and open its detail page.

    Fix: the "merchant not found" probe joined raw selectors — ``"".join()``
    over a selector list raises TypeError; ``.extract()`` was missing.
    """
    html = HtmlXPathSelector(response)
    if u"没有找到相应的商户" in "".join(
            html.xpath(u"//h4[contains(text(),'没有找到相应的商户')]/text()").extract()):
        self.items[0]["status"] = 3  # 3 = no matching merchant found at all
        logging.warning(msg="Did not find the corresponding merchant")
        return
    city_name = "".join(
        html.xpath("//a[@class='city J-city']/text()").extract())
    meta = {
        "city": city_name,
        "vendor_name": self.vendor_name,
        "district_name": self.district_name,
        "jobid": self.jobid,
        "base_url": "http://www.dianping.com",
        "type": ""
    }
    Effective_num = 0
    for items in html.xpath("//div[@id='shop-all-list']/ul/li"):
        title_name = "".join(
            items.xpath(".//div[@class='tit']/a/h4/text()").extract())
        # Only rows whose title contains the requested vendor name count.
        if meta["vendor_name"] in title_name:
            Effective_num += 1
            title_url = "".join(
                items.xpath(".//div[@class='tit']/a[position()=1]/@href").extract())
            meta["type"] = "".join(
                items.xpath(
                    ".//div[@class='tag-addr']/a[position()=1]/span/text()"
                ).extract()).replace("\r", "").replace("\n", "").replace(
                    "\t", "").replace(" ", "")
            meta["business_district"] = "".join(
                items.xpath(
                    ".//div[@class='tag-addr']/a[position()=2]/span/text()"
                ).extract()).replace("\r", "").replace("\n", "").replace(
                    "\t", "").replace(" ", "")
            meta["url"] = urljoin(meta["base_url"], title_url)
            yield Request(urljoin(meta["base_url"], title_url),
                          meta=meta,
                          callback=self.parse_xiangxi)
    if Effective_num == 0:
        self.items[0]["status"] = 5  # 5 = results exist, none match the vendor
        logging.warning(msg="Not a specified merchant")
        return
    next_page = "".join(html.xpath(u"//a[text()='下一页']/@href").extract())
    if next_page:
        yield Request(urljoin(meta["base_url"], next_page),
                      callback=self.parse,
                      meta=meta)
    del response
def parse_items(self, response):
    """Parse one JD order-detail page (two layout variants) into self.items.

    The xpath unions cover several historical page layouts; ``price_re`` is a
    module-level pattern used to pull numeric values.

    NOTE(review): the original source reached us with line structure mangled —
    the placement of paytime/bill* inside the else-branch below is
    reconstructed and should be confirmed against the original file.
    NOTE(review): several .re() patterns contain fullwidth parentheses
    (matching the page's CJK punctuation); if any of those parens are ASCII
    the pattern is unbalanced and re.error is raised — confirm byte-for-byte.
    """
    self.jobid = response.meta["jobid"]
    item = {}
    hxs = HtmlXPathSelector(response)
    item["url"] = response.url
    item["orderid"] = "".join(
        hxs.xpath(
            u"//input[@id='orderid']/@value|//div[@class='w o-detail cj-share']/@orderid|//div[contains(text(),'订单号:')]/text()"
        ).re(r"%s" % price_re))
    if item["orderid"] == "":
        # Page did not render an order id — retry the same URL once more.
        return Request(url=response.url,
                       meta={"jobid": self.jobid},
                       callback=self.parse_items,
                       dont_filter=True)
    item["ordertime"] = "".join(
        hxs.xpath(
            u"translate(//li[contains(text(),'下单时间:')]/text()|//input[contains(@id,'datesubmit-')]/@value|//td[contains(text(),'下单时间')]/following-sibling::td[1]/text(),'下单时间:','')"
        ).re(r'\S+'))
    item["ordercount"] = "".join(
        hxs.xpath(
            u"translate(//li[contains(text(),'充值面额:')]/text()|//span[contains(text(),'商品总额:')]/following-sibling::div[1]/span/text()|//td[contains(text(),'商品金额')]/following-sibling::td[1]/strong/text(),'充值面额:','')"
        ).re(r"%s" % price_re))
    if "orderId" in response.url:
        # Newer order-detail layout (orderId in the URL).
        item["receivername"] = "".join(
            hxs.xpath(
                u"//div[contains(text(),'收货人信息:')]/following-sibling::div[1]/text()"
            ).re(r'([\S\s]+?)('))
        item["receiverphone"] = "".join(
            hxs.xpath(
                u"//div[contains(text(),'收货人信息:')]/following-sibling::div[1]/text()"
            ).re(r"(([\S\s]+?))"))
        item["receiveraddress"] = "".join(
            hxs.xpath(
                u"//div[contains(text(),'收货地址:')]/following-sibling::div[1]/text()"
            ).re(r'\S+'))
        item["receiveridno"] = "".join(
            hxs.xpath(
                u"translate(//div[contains(text(),'收货人信息:')]/following-sibling::div[2]/text(),',','')"
            ).re(r"\S+"))
        item["paycount"] = "".join(
            hxs.xpath(
                u"//div[contains(text(),'应付金额:')]/following-sibling::div[1]/b/text()"
            ).re(r'%s' % price_re))
    else:
        # Legacy layout: label/value pairs in spans or table cells.
        item["paycount"] = "".join(
            hxs.xpath(
                u"translate(//li[contains(text(),'在线支付:')]/text()|//span[contains(text(),'应支付金额:')]/following-sibling::div[1]/span/text()|//td[contains(text(),'商品金额')]/following-sibling::td[1]/strong/text(),'在线支付:','')"
            ).re(r'%s' % price_re))
        item["receiveridno"] = ""
        item["receivername"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'收货人:')]/following-sibling::div[1]/text()|//td[contains(text(),'收货人姓名')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["receiverphone"] = "".join(
            hxs.xpath(
                u"translate(//li[contains(text(),'手机号码:')]/text()|//span[contains(text(),'手机号码:')]/following-sibling::div[1]/text()|//td[contains(text(),'固定电话')]/following-sibling::td[1]/text(),'手机号码:','')"
            ).re(r"\S+"))
        item["receiveraddress"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'地址:')]/following-sibling::div[1]/text()|//td[contains(text(),'地址')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["paytime"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'付款时间:')]/following-sibling::div[1]/text()|//td[contains(text(),'下单时间')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["billtype"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'发票类型:')]/following-sibling::div[1]/text()|//td[contains(text(),'发票类型')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["billtitle"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'发票抬头:')]/following-sibling::div[1]/text()|//td[contains(text(),'发票抬头')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
        item["billcontent"] = "".join(
            hxs.xpath(
                u"//span[contains(text(),'发票内容:')]/following-sibling::div[1]/text()|//td[contains(text(),'发票内容')]/following-sibling::td[1]/text()"
            ).re(r'\S+'))
    # Line items: each row yields name, price, quantity, sku id and URL.
    _item = []
    for goods in hxs.xpath(
            "//tr[contains(@class,'product-')]|//table[@class='tb-void tb-none']/tbody/tr|//td[@class='itemName']/../following-sibling::tr"
    ):
        _goods = {}
        _goods["itemname"] = "".join(
            goods.xpath(".//a[contains(@href,'item.jd')]/text()").re(
                r'\S+'))
        _goods["itemprice"] = "".join(
            goods.xpath(
                ".//*[@class='f-price']/text()|.//td[3]/strong/text()|.//*[@class='jdPrice']/text()"
            ).re(r'%s' % price_re))
        _goods["itemnum"] = "".join(
            goods.xpath(".//td[5]/text()|.//*[@class='num']/text()").re(
                r'%s' % price_re))
        _goods["itemid"] = "".join(
            goods.xpath(".//a[contains(@href,'item.jd')][1]/@href").re(
                r'%s' % price_re))
        _goods["itemurl"] = urljoin(
            self.start_urls[0], "".join(
                goods.xpath(
                    ".//a[contains(@href,'item.jd')][1]/@href").extract()))
        _item.append(_goods)
    item["items"] = _item
    item["jobid"] = self.jobid
    self.items.append(item)
def parse_get_login(self, response):
    """Check the Hunan fund login result and fan out the three data POSTs.

    The login page echoes the employee identity in hidden inputs; they are
    captured on ``self`` and re-posted as the shared form payload.

    Fix: the error probe ``'错误' in response.body.decode(...)`` tested a
    byte-string literal against a unicode string — under Python 2 that
    implicitly ASCII-decodes the non-ASCII literal and raises
    UnicodeDecodeError whenever it is evaluated (i.e. on every successful
    login).  The check now uses an explicit unicode literal against the
    decoded page, which also covers the raw-bytes case.
    """
    html = HtmlXPathSelector(response)
    self.cxyd = "当前年度"  # query period ("current year"), posted back verbatim
    self.zgzh = ''.join(
        html.xpath("//*[@name='zgzh']/@value").extract())  # employee account no.
    self.sfzh = ''.join(
        html.xpath("//*[@name='sfzh']/@value").extract())  # national ID number
    self.zgxm = ''.join(
        html.xpath("//*[@name='zgxm']/@value").extract())  # employee name
    self.dwbm = ''.join(
        html.xpath("//*[@name='dwbm']/@value").extract())  # employer code (presumably)
    self.zgzt = ''.join(
        html.xpath("//*[@name='zgzt']/@value").extract())  # account status
    # Login failed when the identity echo is missing or the (gb2312) page
    # contains an error marker.
    if not self.zgxm and not self.zgzh or u'错误' in response.body.decode(
            "gb2312", "ignore"):
        logger.error(msg="<%s>,Login HUNANGJJ error, password error!!!" %
                     self.idcard)
        self.sign = 0
        self.item_status["code"] = 2104  # 2104: wrong password
        self.con.hmset(self.key, dict(self.item_status))
        return
    else:
        self.item_status["code"] = 2102  # 2102: login succeeded
        self.con.hmset(self.key, dict(self.item_status))
        logger.info(msg="<%s>,Login HUNANGJJ success!!!" % self.idcard)
        self.sign = 1
    # Shared payload for the info / detail / loan queries below.
    self.form_data = {
        "sfzh": str(self.sfzh),
        "zgxm": str(self.zgxm),
        "zgzh": str(self.zgzh),
        "dwbm": str(self.dwbm),
        "cxyd": str(self.cxyd),
        "zgzt": str(self.zgzt),
    }
    yield FormRequest(url=self.post_xx_url,
                      formdata=self.form_data,
                      callback=self.parse_get_xinxi,
                      dont_filter=True)
    yield FormRequest(url=self.post_mx_url,
                      formdata=self.form_data,
                      callback=self.parse_get_mx,
                      dont_filter=True)
    yield FormRequest(url=self.post_dk_url,
                      formdata=self.form_data,
                      callback=self.parse_get_daikuanxinxi,
                      dont_filter=True)
def parse_get_daikuanxinxi(self, response):
    """Parse the Hunan fund loan-info page and store it to HBase.

    Fixes:
    - the "no loan" probe called ``.encode("gb2312")`` on a *byte* string —
      under Python 2 that first implicitly ASCII-decodes the non-ASCII
      literal and raises UnicodeDecodeError; the literal is now unicode so
      the encode is well-defined;
    - the loan-term normalization called ``.replace("年", "")`` (byte-string
      argument) on a unicode value — the same implicit ASCII decode would
      raise; only the unicode replace is kept;
    - the label-relative table lookups are deduplicated into one loop.
    """
    logger.info(msg="<%s>,get HUNANGJJ DKXX!!!" % self.idcard)
    if u"该职工没有贷款".encode("gb2312", "ignore") in response.body:
        # Page says this employee has no loan — nothing to store.
        logger.info(msg="<%s>,DKXX NO DATA!!!" % self.idcard)
        return
    html = HtmlXPathSelector(response)

    def cell(label, col):
        # Value is the col-th <td> of the row containing the label <td>.
        return ''.join(
            html.xpath(u"//td[text()='%s']/../td[%d]//text()" %
                       (label, col)).extract()).strip()

    item = {}
    for field, label, col in (
            ("userName", u"姓名", 4),
            ("loanMoney", u"贷款金额", 2),
            ("repaidPrincipal", u"已还本金", 2),
            ("repaidInterest", u"已还利息", 4),
            ("loanBalance", u"贷款余额", 2),
            ("monthLeastRepayment", u"月最低还款", 4),
            ("overdueMoney", u"当前逾期金额", 2),
            ("overdueAccrual", u"当前逾期利息", 4),
            ("repaidDay", u"月还款日", 2),
            ("loanDay", u"放款日期", 2),
            ("entrustBank", u"受托银行", 4),
            ("loanInterestRate", u"贷款利率", 2),
            ("overdueTimes", u"当前逾期期数", 4),
            ("repaidType", u"还款方式", 2),
            ("securityType", u"担保方式", 4),
            ("loanType", u"购房类型", 2),
            ("monthHedging", u"是否办理月对冲", 4)):
        item[field] = cell(label, col)
    # Loan term is shown in years ('N年'); normalize to months.
    loanLimit = cell(u"贷款年限", 4)
    if loanLimit:
        item["loanLimit"] = int(loanLimit.replace(u"年", "")) * 12
    else:
        item["loanLimit"] = ""
    logger.info(msg="<%s>,get HUNANGJJ DKXX OVER!!!" % self.idcard)
    self.tb = HbClient()
    self.tb.insert(colname=u"公积金-湖南-贷款信息查询 ",
                   url=response.url,
                   html=response.body,
                   struct_dic={"loan_info": [item]},
                   id=self.idcard,
                   post_dic=self.form_data,
                   token=self.token)
def parse(self, response): self.passwd = urllib.unquote(self.settings.get( "PASSWD", None)) #.decode("ascii").encode("utf8") self.username = urllib.unquote(self.settings.get( "USERNAME", None)) #.decode("ascii").encode("utf8") self.jobid = urllib.unquote(self.settings.get( "JOBID", None)) #.decode("ascii").encode("utf8") self.vercode = urllib.unquote(self.settings.get( "VERCODE", None)) #.decode("ascii").encode("utf8") # self.passwd = "Zqp821907280&@#" # # self.username = "******" # # self.jobid = "y32783y2cnj2neckjn2c" # # self.vercode = "" self.con.hmset(self.jobid, {"status": 0}) hxs = HtmlXPathSelector(response) pubKey = "".join(hxs.xpath('//input[@name="pubKey"]/@value').extract()) keyDER = b64decode(pubKey) keyPub = RSA.importKey(keyDER) nloginpwd = b64encode(rsa.encrypt(b"%s" % self.passwd, keyPub)) self.uuid = "".join( hxs.xpath('//input[@name="uuid"]/@value').extract()) fp = "".join(hxs.xpath('//input[@name="fp"]/@value').extract()) _t = "".join(hxs.xpath('//input[@name="_t"]/@value').extract()) loginType = "".join( hxs.xpath('//input[@name="loginType"]/@value').extract()) eid = "".join(hxs.xpath('//input[@name="eid"]/@value').extract()) self.authcode = "" self.post_data = { "uuid": self.uuid, "eid": eid, "fp": fp, "_t": _t, "loginType": loginType, "loginname": "%s" % self.username, "nloginpwd": nloginpwd, "chkRememberMe": "on", "authcode": self.authcode, } auth_dat = { 'loginName': self.username, } self.sess = {} cookie = [ i.split(";")[0] for i in response.headers.getlist('Set-Cookie') ] for cook in cookie: self.sess.update( {cook[:cook.index("=")]: cook[cook.index("=") + 1:]}) code_url = self.code_url % (random.random(), 2015) yield FormRequest(url=code_url, cookies=self.sess, formdata=auth_dat, callback=self.checkauthcode)
def parse_get_mx(self, response):
    """Parse the Hunan fund transaction-detail table and store rows to HBase."""
    logger.info(msg="<%s>,get HUNANGJJ MX!!!" % self.idcard)
    html = HtmlXPathSelector(response)
    # Fixed column layout of each detail row.
    columns = (("payDate", 1),     # transaction date
               ("debtorRmb", 2),   # debit amount
               ("lenderRmb", 3),   # credit amount
               ("balanceRmb", 4),  # running balance
               ("trend", 5),       # debit/credit direction
               ("payType", 6))     # transaction type / summary
    rows = []
    for tr in html.xpath("//*[@class='jtpsoft']"):
        record = {}
        for field, col in columns:
            record[field] = ''.join(tr.xpath(".//td[%d]/text()" % col).extract())
        # Rows without a date are header/padding rows — skip them.
        if record["payDate"]:
            rows.append(record)
    if not rows:
        self.sign = 0
        self.item_status["fatch_code"] = 2199
        self.con.hmset(self.key, dict(self.item_status))
        logger.error(msg="<%s>,get HUNANGJJ MX NO DATA!!!" % self.idcard)
    else:
        self.tb = HbClient()
        self.tb.insert(colname=u"公积金-湖南-明细查询 ",
                       url=response.url,
                       html=response.body,
                       struct_dic={"detail_info": rows},
                       id=self.idcard,
                       post_dic=self.form_data,
                       token=self.token)
        logger.info(msg="<%s>,get HUNANGJJ MX OVER!!!" % self.idcard)