def getPageTotal():
    """Return the total number of SSQ result index pages as an int.

    Fetches the first index page, takes the last text node matched by the
    XPath below and parses the page count out of it.  Per the original
    author's note the text looks like "/25" followed by the Chinese
    character for "page".

    The count is read as the first run of ASCII digits in that text instead
    of slicing a fixed number of bytes off its UTF-8 encoding, so the result
    no longer depends on exactly how many bytes follow the number.

    NOTE(review): getPageTotal is defined twice in this file; Python keeps
    the later definition.

    Raises:
        ValueError: if the text contains no digits.
        IndexError: if no node matches (assuming getNodes returns a list).
    """
    import re  # function-scope import keeps this block self-contained

    pattern = u"//td[@valign='middle']/text()"
    url = SITE + "/kjxx/ssq/kjgg/index.shtml"
    page = spider.getPage(url)
    node = spider.getNodes(page, pattern)

    # The last matched text node is the one carrying the page total.
    label = ''.join(node[-1])
    found = re.search(r'[0-9]+', label)
    if found is None:
        raise ValueError("no page count found in pager text: %r" % (label,))
    return int(found.group())
def getPageTotal():
    """Return the total number of SSQ result index pages as an int.

    Fetches the first index page, takes the last text node matched by the
    XPath below and parses the page count out of it.  Per the original
    author's note the text looks like "/25" followed by the Chinese
    character for "page".

    The count is read as the first run of ASCII digits in that text instead
    of slicing a fixed number of bytes off its UTF-8 encoding, so the result
    no longer depends on exactly how many bytes follow the number.

    NOTE(review): an earlier definition of getPageTotal precedes this one in
    the file; this later definition is the one that stays bound.

    Raises:
        ValueError: if the text contains no digits.
        IndexError: if no node matches (assuming getNodes returns a list).
    """
    import re  # function-scope import keeps this block self-contained

    pattern = u"//td[@valign='middle']/text()"
    url = SITE + "/kjxx/ssq/kjgg/index.shtml"
    page = spider.getPage(url)
    node = spider.getNodes(page, pattern)

    # The last matched text node is the one carrying the page total.
    label = ''.join(node[-1])
    found = re.search(r'[0-9]+', label)
    if found is None:
        raise ValueError("no page count found in pager text: %r" % (label,))
    return int(found.group())
def getIndexSSQ(index):
    """Collect the SSQ draw results linked from one index page.

    ``index`` is 1-based: 1 maps to ``index.shtml`` and any other value ``n``
    maps to ``index_<n-1>.shtml``.  Every node matched by ``PATTERN_HERF`` on
    that page is followed and the linked page is parsed.

    Returns a list of dicts, one per followed link, holding ``'red'`` (the
    text of every ``PATTERN_NODE`` match except the last), ``'blue'`` (a list
    with the text of the last match, empty when nothing matched) and the
    keys of the mapping returned by ``_getTime``.

    NOTE(review): getIndexSSQ is defined twice in this file; Python keeps
    the later definition.
    """
    offset = index - 1
    if offset == 0:
        listingUrl = SITE + "/kjxx/ssq/kjgg/index.shtml"
    else:
        listingUrl = SITE + "/kjxx/ssq/kjgg/index_%d.shtml" % (offset)
    listing = spider.getPage(listingUrl)

    draws = []
    for link in spider.getNodes(listing, PATTERN_HERF):
        # The first 8 characters of the href are dropped before SITE is
        # prepended.
        drawPage = spider.getPage(SITE + link.attrib['href'][8:])
        balls = [ball.text for ball in spider.getNodes(drawPage, PATTERN_NODE)]
        # Issue-number information for this draw.
        issueInfo = _getTime(drawPage)
        # Merge the issue information into the per-draw record.
        entry = {'red': balls[:-1], 'blue': balls[-1:]}
        entry.update(issueInfo)
        draws.append(entry)
    return draws
def getIndexSSQ(index):
    """Collect the SSQ draw results linked from one index page.

    ``index`` is 1-based: 1 maps to ``index.shtml`` and any other value ``n``
    maps to ``index_<n-1>.shtml``.  Every node matched by ``PATTERN_HERF`` on
    that page is followed and the linked page is parsed.

    Returns a list of dicts, one per followed link, holding ``'red'`` (the
    text of every ``PATTERN_NODE`` match except the last), ``'blue'`` (a list
    with the text of the last match, empty when nothing matched) and the
    keys of the mapping returned by ``_getTime``.

    NOTE(review): an earlier definition of getIndexSSQ precedes this one in
    the file; this later definition is the one that stays bound.
    """
    offset = index - 1
    suffix = "index.shtml" if offset == 0 else "index_%d.shtml" % (offset)
    listing = spider.getPage(SITE + "/kjxx/ssq/kjgg/" + suffix)

    draws = []
    for link in spider.getNodes(listing, PATTERN_HERF):
        # The first 8 characters of the href are dropped before SITE is
        # prepended.
        drawUrl = SITE + link.attrib['href'][8:]
        drawPage = spider.getPage(drawUrl)
        ballNodes = spider.getNodes(drawPage, PATTERN_NODE)
        balls = [ball.text for ball in ballNodes]
        # Issue-number information for this draw.
        issueInfo = _getTime(drawPage)
        # Merge the issue information into the per-draw record.
        entry = {'red': balls[:-1], 'blue': balls[-1:]}
        entry.update(issueInfo)
        draws.append(entry)
    return draws
def getPage(url):
    """Fetch ``url`` via ``spider.getPage`` and return its result.

    'gb2312' is passed as the second argument (presumably the page
    encoding -- confirm against ``spider.getPage``).
    """
    encoding = 'gb2312'
    return spider.getPage(url, encoding)