class rebot_obj: def __init__(self): self.url = "http://top.chinaz.com/list.aspx?p=%d" self.curUrl = None self.pageNum = 1 self.curRequest = None self.urlList = [] def run(self): while self.pageNum < 300: self.curUrl = self.url % self.pageNum print self.curUrl self.curRequest = Requset(self.curUrl, 10) self.curRequest.run() doc = self.curRequest.get_doc() infoTag = doc.xpath("//div[@class='info']/h3/a") for info in infoTag: try: domainTitle = info.text url = info.get("href") url = url.replace("/site_", "http://").replace(".html", "/") domain = url_object(url).getRootDomain self.urlList.append((domain, domainTitle)) except Exception, e: print "parse_web.rebot_obj.run: %s" % e self.pageNum += 1
class rebot_obj(): def __init__(self): self.url = 'http://top.chinaz.com/list.aspx?p=%d' self.curUrl = None self.pageNum = 1 self.curRequest = None self.urlList = [] def run(self): while self.pageNum < 300: self.curUrl = self.url % self.pageNum print self.curUrl self.curRequest = Requset(self.curUrl, 10) self.curRequest.run() doc = self.curRequest.get_doc() infoTag = doc.xpath("//div[@class='info']/h3/a") for info in infoTag: try: domainTitle = info.text url = info.get('href') url = url.replace("/site_", "http://").replace(".html", "/") domain = url_object(url).getRootDomain self.urlList.append((domain, domainTitle)) except Exception, e: print 'parse_web.rebot_obj.run: %s' % e self.pageNum += 1
def get_script_content_in_js(self):
    """Download each external JS file referenced by this page's script
    tags and return the list of response bodies.

    Fetches that yield no text (``get_text()`` returns ``None``) are
    silently skipped.
    """
    contents = []
    for js_url in self.get_script_tag_js_list:
        req = Requset(js_url, 1)
        req.run()
        body = req.get_text()
        if body is not None:
            contents.append(body)
    return contents
def get_style_content_in_css(self):
    """Fetch every external CSS file referenced by this page's link tags
    and return the bodies of those that downloaded successfully.

    A fetch whose ``get_text()`` comes back ``None`` is dropped.
    """
    fetched = []
    for css_url in self.get_link_tag_css_list:
        resp = Requset(css_url, 1)
        resp.run()
        text = resp.get_text()
        if text is not None:
            fetched.append(text)
    return fetched