def parse_showdesk_members_treat(self, resp):
    """Parse one page of the member "treat" listing.

    When the response is page 1 (``resp.meta['page'] == 1``) and contains a
    ``next_page`` link, one FormRequest per remaining page (2..total) is
    yielded, each calling back into this method with a copy of the meta
    whose ``page`` is set accordingly.

    Afterwards every ``<tr>`` in the body of the first table found under
    ``div.page_main`` / ``div.table-responsive`` is yielded as a
    SentreeMemberTreatItem: ``hs`` holds the cleaned header texts and
    ``vals`` the cleaned, concatenated text of each cell. If no such table
    exists, a single ``None`` is yielded.
    """
    selector = Selector(resp)
    next_links = selector.xpath('//a[@class="next_page"]')
    meta = resp.meta

    if next_links and meta['page'] == 1:
        # The <li> right before the "next page" entry carries the number of
        # the last page.
        earlier_entries = next_links[0].xpath('./parent::li/preceding-sibling::li')
        last_page_texts = earlier_entries[-1].xpath('a/child::text()').extract()
        total_page = last_page_texts[0].strip()
        for page_no in xrange(2, int(total_page) + 1):
            page_meta = dict(meta)
            page_meta['page'] = page_no
            self.log('%s yield member list page %d' % (self.name, page_no))
            form_fields = {
                'page.currNum': str(page_no),
                'page.rpp': '30',
                'r': str(meta['r']),
                'set': 'manage',
            }
            yield FormRequest(
                url="http://vip6.sentree.com.cn/shair/timesItem!initTreat.action",
                formdata=form_fields,
                callback=self.parse_showdesk_members_treat,
                meta=page_meta)

    tables = selector.xpath('//div[@class="page_main"]//div[@class="table-responsive"]/table')
    if not tables:
        yield None
        return

    table = tables[0]
    # NOTE(review): the strip list contains ' ' twice as written here; one of
    # them may originally have been a different space-like character - confirm.
    column_titles = str_list_strip_replace(
        table.xpath('./thead/tr/th/child::text()').extract(),
        [' ', '\t', '\n', ' '])

    for row in table.xpath('./tbody/tr'):
        cell_values = [
            ''.join(str_list_strip_replace(
                cell.xpath('.//child::text()').extract(),
                [' ', '\t', '\n', ' ']))
            for cell in row.xpath('./td')
        ]
        item = SentreeMemberTreatItem()
        item['hs'] = column_titles
        item['vals'] = cell_values
        yield item
def parse_showdesk_membercards(self, resp):
    """Parse the member card type table inside ``form#cardTypeForm``.

    Yields one SentreeMemberCardItem per body row. Its ``info`` field is an
    OrderedDict keyed by the column header text:

    * the first and the second-to-last columns are skipped;
    * the last column joins its direct child text nodes with ' | ';
    * column 3 concatenates its descendant text nodes without a separator;
    * every other column joins its descendant text nodes with ' | '.

    If the header row or the body rows cannot be found, an error is logged
    and a single ``None`` is yielded.
    """
    selector = Selector(resp)

    column_titles = selector.xpath(
        '//form[@id="cardTypeForm"]//table/thead/tr/th/child::text()').extract()
    if not column_titles:
        self.log('%s can not find table headers.' % self.name, level=log.ERROR)
        yield None
        return

    rows = selector.xpath('//form[@id="cardTypeForm"]//table/tbody/tr')
    if not rows:
        self.log('%s can not find member card info' % self.name, level=log.ERROR)
        yield None
        return

    for row in rows:
        cells = row.xpath('td')
        last_idx = len(cells) - 1
        info = OrderedDict({})
        for idx, cell in enumerate(cells):
            if idx in (0, last_idx - 1):
                continue
            if idx == last_idx:
                # Last column: only the cell's own text nodes are used.
                info[column_titles[idx]] = ' | '.join(str_list_strip_replace(
                    cell.xpath('./child::text()').extract(),
                    [' ', '\t', '\n', ' ']))
            else:
                separator = '' if idx == 3 else ' | '
                info[column_titles[idx]] = separator.join(str_list_strip_replace(
                    str_list_strip(cell.xpath('descendant::text()').extract()),
                    [' ', '\t', '\n', ' ']))
        item = SentreeMemberCardItem()
        item['info'] = info
        yield item
def parse_showdesk_members2(self, resp):
    """Parse one page of the member list inside ``form#delForm``.

    When the response is page 1 (``resp.meta['page'] == 1``) and contains a
    ``next_page`` link, one FormRequest per remaining page (2..total) is
    yielded, each calling back into this method.

    For every body row a SentreeMembersSimpleItem is filled with phone,
    name and the card details read from the nested table in the seventh
    cell. If a query string could be extracted from the card link's
    ``onclick`` attribute, a Request for the member archive page is yielded
    with the item attached under ``meta['item']``; otherwise the item is
    yielded directly with ``overdraft`` set to '0.0'.

    Rows whose cells cannot be extracted are logged (with traceback) and
    skipped.
    """
    hxs = Selector(resp)
    next_page_nodes = hxs.xpath('//a[@class="next_page"]')
    meta = resp.meta

    if next_page_nodes and meta['page'] == 1:
        next_page_node = next_page_nodes[0]
        # The <li> right before the "next page" entry carries the number of
        # the last page.
        total_page = next_page_node.xpath(
            './parent::li/preceding-sibling::li')[-1].xpath(
            'a/child::text()').extract()[0].strip()
        for i in xrange(2, int(total_page) + 1):
            new_meta = dict(meta)
            new_meta['page'] = i
            self.log('%s yield member list page %d' % (self.name, i))
            yield FormRequest(
                url="http://vip6.sentree.com.cn/shair/memberInfo!memberlist.action",
                formdata={
                    'page.currNum': str(i),
                    'page.rpp': '30',
                    'r': str(meta['r']),
                    'set': 'manage',
                },
                callback=self.parse_showdesk_members2,
                meta=new_meta)

    # Iterating an empty selector list is a no-op, so no emptiness guard is
    # needed here.
    for m_n in hxs.xpath('//form[@id="delForm"]//table/tbody/tr'):
        member_tds = m_n.xpath('td')
        info_query_str = None
        try:
            phone = member_tds[1].xpath('a/child::text()').extract()[0].replace(' ', '').strip()
            name = member_tds[2].xpath('span/child::text()').extract()[0].replace(' ', '').strip()

            # All card details live in a nested table in the seventh cell.
            card_cell = member_tds[6]
            card_no = card_cell.xpath('table/tr/td[1]/a/child::text()').extract()[0].replace(' ', '').strip()

            # Keep what lies between the first '?' and the following quote
            # of the onclick handler.
            info_query_str = card_cell.xpath('table/tr/td[1]/a/@onclick').extract()[0]
            info_query_str = info_query_str[info_query_str.find('?') + 1:]
            info_query_str = info_query_str[:info_query_str.find("'")]

            card_name = card_cell.xpath('table/tr/td[2]/child::text()').extract()[0].replace(' ', '').strip()
            # NOTE(review): the doubled replace(' ', '') looks redundant as
            # written; one of the two may originally have targeted another
            # space-like character - confirm.
            card_type = card_cell.xpath('table/tr/td[3]//child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
            discont = card_cell.xpath('table/tr/td[4]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
            timeout = card_cell.xpath('table/tr/td[9]/child::text()').extract()[0].replace(' ', '').replace(' ', '').strip()
            overage = str_list_strip_replace(
                card_cell.xpath('table/tr/td[7]//child::text()').extract(),
                [' ', ' ', '\t', '\n'])
        except Exception:
            # Catch Exception rather than everything, so SystemExit and
            # KeyboardInterrupt still propagate; a malformed row is logged
            # and skipped.
            self.log(traceback.format_exc())
            continue

        mem_item = SentreeMembersSimpleItem()
        mem_item[u'phone'] = phone
        mem_item[u'name'] = name
        mem_item[u'card_no'] = card_no
        mem_item[u'card_name'] = card_name
        mem_item[u'card_type'] = card_type
        mem_item[u'discont'] = discont
        mem_item[u'timeout'] = timeout
        mem_item[u'overage'] = overage

        if info_query_str:
            new_meta = dict(meta)
            new_meta['item'] = mem_item
            # The current time (truncated to an integer by %d) is appended
            # directly after the extracted query string.
            yield Request(
                url='http://vip6.sentree.com.cn/shair/memberArchives!editMember.action?%s%d' % (info_query_str, time.time()),
                callback=self.parse_member_overdraft,
                meta=new_meta)
        else:
            mem_item['overdraft'] = '0.0'
            yield mem_item
def parse_member_overdraft2(self, resp):
    """Compute a member's outstanding overdraft and finish the item.

    The item is taken from ``resp.meta['item']``. Amounts are read from the
    third column and statuses from the ``<font>`` in the fifth column of the
    table under ``div.table-responsive``. Rows whose status contains
    u'已还清' ("paid off") are subtracted from the running total, rows
    containing u'未还清' ("not paid off") are added. A negative total is
    reset to zero and the result is stored, formatted with one decimal, in
    ``mem_item['overdraft']``. Without any amount cells the value is '0.0'.

    Yields the completed item.
    """
    mem_item = resp.meta['item']
    selector = Selector(resp)

    amount_nodes = selector.xpath(
        '//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
    if amount_nodes:
        amounts = str_list_strip_replace(
            amount_nodes.extract(), [' ', ' ', '\t', '\n'])
        statuses = str_list_strip_replace(
            selector.xpath(
                '//div[@class="table-responsive"]/table/tbody/tr/td[5]/font/child::text()').extract(),
            [' ', ' ', '\t', '\n'])

        balance = float(0)
        for idx, raw_amount in enumerate(amounts):
            amount = float(raw_amount)
            status = statuses[idx]
            if u'已还清' in status:
                balance -= amount
            elif u'未还清' in status:
                balance += amount

        # The reported overdraft is never negative.
        if balance < 0:
            balance = float(0)
        overdraft = '%.1f' % balance
    else:
        overdraft = '0.0'

    mem_item['overdraft'] = overdraft
    yield mem_item
def parse_showdesk_services(self, resp):
    """Parse the service table ``table#itemset``.

    Yields one SentreeServiceItem per body row. Its ``info`` field is an
    OrderedDict keyed by the column header text:

    * the first and the last columns are skipped;
    * column 8 stores the list of cleaned text nodes of the page-wide
      ``<span id="pricespan<no>">``, where ``<no>`` is the value parsed
      from column 1 of the same row;
    * column 9 joins the text of each ``div`` whose id starts with
      "icddiv" using ' | ', and the divs themselves using ' ||| ';
    * every other column joins its descendant text nodes with ' | '.

    If the header row or the body rows cannot be found, an error is logged
    and a single ``None`` is yielded.
    """
    page = Selector(resp)

    column_titles = page.xpath(
        '//table[@id="itemset"]/thead/tr/th/child::text()').extract()
    if not column_titles:
        self.log('%s can not find table headers.' % self.name, level=log.ERROR)
        yield None
        return

    rows = page.xpath('//table[@id="itemset"]/tbody/tr')
    if not rows:
        self.log('%s can not find services info' % self.name, level=log.ERROR)
        yield None
        return

    for row in rows:
        cells = row.xpath('td')
        last_idx = len(cells) - 1
        info = OrderedDict({})
        service_no = None
        for idx, cell in enumerate(cells):
            if idx == 0 or idx == last_idx:
                continue
            if idx == 8:
                # The price is looked up document-wide by the id derived
                # from the value captured at column 1.
                price_texts = page.xpath(
                    '//span[@id="pricespan%s"]' % service_no).xpath(
                    'child::text()').extract()
                info[column_titles[idx]] = str_list_strip_replace(
                    str_list_strip(price_texts), [' ', '\t', '\n'])
            elif idx == 9:
                discounts = [
                    ' | '.join(str_list_strip_replace(
                        str_list_strip(div.xpath('./child::text()').extract()),
                        [' ', '\t', '\n']))
                    for div in cell.xpath('.//div[starts-with(@id, "icddiv")]')
                ]
                info[column_titles[idx]] = ' ||| '.join(discounts)
            else:
                info[column_titles[idx]] = ' | '.join(str_list_strip_replace(
                    str_list_strip(cell.xpath('descendant::text()').extract()),
                    [' ', '\t', '\n']))
                if idx == 1:
                    service_no = info[column_titles[idx]]
        item = SentreeServiceItem()
        item['info'] = info
        yield item
# Ad-hoc debugging harness: loads a locally saved HTML page, wraps it in a
# TextResponse and runs extraction code against it.
# The context manager guarantees the file is closed even if reading fails.
with open('e:\\1.html') as f:
    html = f.read()
resp = TextResponse(url="", body=html)

if 1:
    # Quick check of the overdraft extraction: print the first cleaned
    # amount (or '0' when the table has no amount cells), then stop.
    hxs = Selector(resp)
    total_overdraft_nodes = hxs.xpath('//div[@class="table-responsive"]/table/tbody/tr/td[3]/child::text()')
    if not total_overdraft_nodes:
        overdraft = '0'
    else:
        overdraft = str_list_strip_replace(total_overdraft_nodes.extract(), [' ', ' ', '\t', '\n'])[0]
    print(overdraft)
    sys.exit(0)

# Everything below is unreachable while the block above exits.
s = SentreeSpider()
try:
    # NOTE(review): parse_showdesk_services contains `yield`, so this call
    # only creates a generator; nothing is parsed unless it is iterated.
    s.parse_showdesk_services(resp)
except Exception:
    print(traceback.format_exc())
print(json.dumps(obj=items, ensure_ascii=False, indent=4))
# sys.exit(0)
#
# SentreeSpider().parse_consumer_bill_stream_validate(resp)
#
# datas = []