def __init__(self, main_url, url="", obj_name=None, limit=0, html=True, floor=0):
    """Initialise crawl state and resolve the name stored as ``obj_name``.

    ``main_url`` is always downloaded (passing ``retry=100`` to
    ``reconnecting_urlopen``), decoded as GBK with undecodable bytes
    ignored, logged and parsed. The parsed page is only consulted here
    when ``obj_name`` is falsy.

    Args:
        main_url: URL that is downloaded and parsed; also used as the
            start URL when ``url`` is falsy.
        url: Optional start URL; when truthy it is used for both
            ``nowurl`` and ``starturl`` instead of ``main_url``.
        obj_name: Name to keep as ``self.obj_name``. When falsy it is
            extracted from the downloaded page.
        limit: Stored unchanged as ``self.limit``.
        html: Stored unchanged as ``self.html``.
        floor: Stored unchanged as ``self.floor``.
    """
    first_url = url or main_url

    self.limit = limit
    self.counter = 0
    self.html = html
    self.floor = floor
    self.nowurl = first_url
    self.starturl = first_url
    self.page = 1

    # Tianya threads currently come in two kinds, "techforum" and
    # "publicforum", and each follows its own handling rules. The kind is
    # taken from the fourth "/"-separated piece of the start URL.
    url_parts = self.starturl.split("/")
    self.thread_type = url_parts[3]

    raw_page = reconnecting_urlopen(main_url, retry=100)
    content = raw_page.decode("gbk", "ignore")
    log_fields = (len(content), type(content), content[:100])
    log.info("content %s %s %s" % log_fields)
    soup = BeautifulSoup(content)

    # Resolve obj_name: keep the caller's value, otherwise fall back to the
    # thread starter's nickname read from the page.
    self.obj_name = obj_name
    if not obj_name:
        if self.thread_type == "techforum":
            vcard = soup.find("div", {"class": "vcard"})
            author_link = vcard.find("a", target="_blank")
            self.obj_name = author_link.renderContents()
        else:
            self.obj_name = self.get_firstauthor(soup)
def __init__(self, main_url, url="", obj_name=None, limit=0, html=True, floor=0):
    """Prepare the iterator state and work out ``self.obj_name``.

    The page at ``main_url`` is fetched unconditionally (``retry=100`` is
    passed to ``reconnecting_urlopen``), decoded from GBK while ignoring
    bad bytes, logged, and parsed with ``BeautifulSoup``. If the caller
    did not supply a truthy ``obj_name``, the name is read from that page.

    Args:
        main_url: URL fetched and parsed here; doubles as the start URL
            when ``url`` is falsy.
        url: Optional start URL that takes precedence over ``main_url``
            for ``nowurl`` and ``starturl``.
        obj_name: Value for ``self.obj_name``; looked up from the page
            when falsy.
        limit: Kept as ``self.limit``.
        html: Kept as ``self.html``.
        floor: Kept as ``self.floor``.
    """
    # Plain bookkeeping copied from the arguments.
    self.limit, self.counter = limit, 0
    self.html, self.floor = html, floor
    self.nowurl = url or main_url
    self.starturl = url or main_url
    self.page = 1

    # Tianya currently has two kinds of threads, each processed by
    # different rules: "techforum" and "publicforum". The fourth
    # "/"-separated component of the start URL says which one this is.
    self.thread_type = self.starturl.split("/")[3]

    content = reconnecting_urlopen(main_url, retry=100).decode(
        "gbk", "ignore")
    message = "content %s %s %s" % (
        len(content), type(content), content[:100])
    log.info(message)
    soup = BeautifulSoup(content)

    # Settle obj_name; when none was given, use the thread starter's
    # nickname taken from the page.
    self.obj_name = obj_name
    needs_lookup = not self.obj_name
    if needs_lookup and self.thread_type == "techforum":
        card = soup.find("div", {"class": "vcard"})
        self.obj_name = card.find("a", target="_blank").renderContents()
    elif needs_lookup:
        self.obj_name = self.get_firstauthor(soup)
def __init__(self, main_url, url="", obj_name=None, limit=0, html=False, floor=0):
    """Initialise crawl state and, if needed, look up ``self.obj_name``.

    The network is only touched when ``obj_name`` is falsy: in that case
    ``main_url`` is fetched (``retry=100`` is passed to
    ``reconnecting_urlopen``), decoded with ``self.ENCODING`` ignoring
    undecodable bytes, logged and parsed, and the name is obtained from
    the first ``div`` whose class is ``topic-doc``.

    Args:
        main_url: URL fetched for the name lookup; also the start URL
            when ``url`` is falsy.
        url: Optional start URL used for ``nowurl`` and ``starturl``
            in preference to ``main_url``.
        obj_name: Value for ``self.obj_name``; looked up when falsy.
        limit: Stored as ``self.limit``.
        html: Stored as ``self.html``.
        floor: Stored as ``self.floor``.
    """
    entry_url = url or main_url

    self.limit = limit
    self.counter = 0
    self.html = html
    self.floor = floor
    self.nowurl = entry_url
    self.starturl = entry_url

    # Resolve obj_name: keep the caller's value when there is one.
    self.obj_name = obj_name
    if not obj_name:
        # No name supplied: get the thread starter's nickname from the page.
        page_bytes = reconnecting_urlopen(main_url, retry=100)
        content = page_bytes.decode(self.ENCODING, "ignore")
        log_fields = (len(content), type(content), content[:100])
        log.info("content %s %s %s" % log_fields)
        soup = BeautifulSoup(content)
        first = soup.find("div", {"class": "topic-doc"})
        self.obj_name = self.get_username(first)
def next(self):
    """Advance the iteration by fetching the page at ``self.nowurl``.

    Iteration stops (``StopIteration``) when a non-zero ``self.limit`` has
    been reached by ``self.counter``, when ``self.nowurl`` is ``None``, or
    when downloading/decoding the page fails for any reason.

    Raises:
        StopIteration: On any of the stop conditions above.
    """
    if (self.limit and self.counter == self.limit) or (self.nowurl is None):
        raise StopIteration

    result = {"url": self.nowurl, "content": []}

    try:
        content = reconnecting_urlopen(self.nowurl).decode("gbk", "ignore")
    except Exception:
        # Deliberate best-effort: a page that cannot be fetched or decoded
        # ends the iteration instead of propagating the error. The bound
        # exception name was unused, so it is dropped; this spelling is
        # valid on both Python 2 and Python 3.
        raise StopIteration
    # NOTE(review): `result` and `content` are not consumed in the visible
    # part of this method - confirm against the rest of the implementation.
def next(self):
    """Advance the iteration by fetching and parsing ``self.nowurl``.

    Before fetching, ``self.page`` is refreshed from
    ``self.get_page(self.nowurl)``. Iteration stops (``StopIteration``)
    when a non-zero ``self.limit`` has been reached by ``self.counter``,
    when ``self.nowurl`` is ``None``, or when downloading, decoding or
    parsing the page fails for any reason.

    Raises:
        StopIteration: On any of the stop conditions above.
    """
    if (self.limit and self.counter == self.limit) or (self.nowurl is None):
        raise StopIteration

    self.page = self.get_page(self.nowurl)
    result = {"url": self.nowurl, "page": self.page, "content": []}

    # First floor number on this page, computed as 100 floors per page.
    floor = (self.page - 1) * 100 + 1

    try:
        content = reconnecting_urlopen(self.nowurl).decode(self.ENCODING, "ignore")
        soup = BeautifulSoup(content)
    except Exception:
        # Deliberate best-effort: any fetch/decode/parse failure ends the
        # iteration instead of propagating. The bound exception name was
        # unused, so it is dropped; this spelling is valid on both
        # Python 2 and Python 3.
        raise StopIteration
    # NOTE(review): `result`, `floor` and `soup` are not consumed in the
    # visible part of this method - confirm against the rest of the
    # implementation.
def __init__(self, main_url, url="", obj_name=None, limit=0, html=False, floor=0):
    """Record the crawl settings and make sure ``self.obj_name`` is set.

    A truthy ``obj_name`` is stored as given and no request is made.
    Otherwise ``main_url`` is downloaded (``retry=100`` is passed to
    ``reconnecting_urlopen``), decoded using ``self.ENCODING`` with
    undecodable bytes ignored, logged, parsed, and the name is taken from
    the first ``div`` with class ``topic-doc`` via ``self.get_username``.

    Args:
        main_url: URL downloaded for the name lookup; also the start URL
            when ``url`` is falsy.
        url: Optional start URL; wins over ``main_url`` for ``nowurl``
            and ``starturl``.
        obj_name: Value for ``self.obj_name``; looked up when falsy.
        limit: Kept as ``self.limit``.
        html: Kept as ``self.html``.
        floor: Kept as ``self.floor``.
    """
    self.limit, self.counter = limit, 0
    self.html, self.floor = html, floor
    self.nowurl = url or main_url
    self.starturl = url or main_url

    # Settle obj_name.
    self.obj_name = obj_name
    if not self.obj_name:
        # Nothing supplied by the caller: read the thread starter's
        # nickname from the page itself.
        content = reconnecting_urlopen(main_url, retry=100).decode(
            self.ENCODING, "ignore")
        message = "content %s %s %s" % (
            len(content), type(content), content[:100])
        log.info(message)
        topic_doc = BeautifulSoup(content).find("div", {"class": "topic-doc"})
        self.obj_name = self.get_username(topic_doc)
def next(self):
    """Fetch and parse the page at ``self.nowurl`` for the next step.

    ``self.page`` is first updated from ``self.get_page(self.nowurl)``.
    ``StopIteration`` is raised when a non-zero ``self.limit`` equals
    ``self.counter``, when ``self.nowurl`` is ``None``, or when the page
    cannot be downloaded, decoded or parsed.

    Raises:
        StopIteration: On any of the stop conditions above.
    """
    limit_reached = self.limit and self.counter == self.limit
    if limit_reached or self.nowurl is None:
        raise StopIteration

    self.page = self.get_page(self.nowurl)

    result = {}
    result["url"] = self.nowurl
    result["page"] = self.page
    result["content"] = []

    # Number of the first floor on this page, at 100 floors per page.
    floor = (self.page - 1) * 100 + 1

    try:
        content = reconnecting_urlopen(self.nowurl).decode(
            self.ENCODING, "ignore")
        soup = BeautifulSoup(content)
    except Exception:
        # Deliberate best-effort: a failed fetch/decode/parse simply ends
        # the iteration. The exception was previously bound to an unused
        # name with Python-2-only syntax; this form works on Python 2 and 3.
        raise StopIteration
    # NOTE(review): `result`, `floor` and `soup` are not consumed in the
    # visible part of this method - confirm against the rest of the
    # implementation.
def __init__(self, main_url, url="", obj_name=None, limit=0, html=False, floor=0):
    """Initialise crawl state, detect nested replies and resolve ``obj_name``.

    ``main_url`` is always downloaded (``retry=100`` is passed to
    ``reconnecting_urlopen``), decoded as GBK with undecodable bytes
    ignored, parsed and logged. ``self.is_lzl`` starts as ``False`` and is
    set from the first group of ``IS_LZL_RE`` when that pattern matches
    the page text. When ``obj_name`` is falsy, the name is taken from the
    first ``div`` with class ``p_post`` via ``self.get_username``.

    Args:
        main_url: URL downloaded and parsed here; also the start URL when
            ``url`` is falsy.
        url: Optional start URL used for ``nowurl`` and ``starturl`` in
            preference to ``main_url``.
        obj_name: Value for ``self.obj_name``; looked up when falsy.
        limit: Stored as ``self.limit``.
        html: Stored as ``self.html``.
        floor: Stored as ``self.floor``.
    """
    entry_url = url or main_url

    self.limit = limit
    self.counter = 0
    self.html = html
    self.floor = floor
    self.nowurl = entry_url
    self.starturl = entry_url
    self.is_lzl = False

    # Page-number handling is disabled: an earlier revision set self.page
    # to 1, or to PN_RE's first group / 30 + 1 when PN_RE matched the
    # start URL.

    # Resolve obj_name: start from the caller's value.
    self.obj_name = obj_name

    raw_page = reconnecting_urlopen(main_url, retry=100)
    content = raw_page.decode("gbk", "ignore")
    soup = BeautifulSoup(content)
    log_fields = (len(content), type(content), content[:100])
    log.info("content %s %s %s" % log_fields)

    # Check whether this is the new page version with nested replies
    # ("floor within a floor"), introduced 2011.09.26.
    lzl_match = IS_LZL_RE.search(content)
    if lzl_match:
        self.is_lzl = int(lzl_match.group(1)) != 0

    if not obj_name:
        # Get the thread starter's nickname. An earlier revision looked in
        # the div with class "post" instead of "p_post".
        first_post = soup.find("div", {"class": "p_post"})
        self.obj_name = self.get_username(first_post)