def __getAllNeedLinks(self):
    """Walk the paginated listing and enqueue every link not yet stored.

    Pages are requested as ``self.baseUrl + self.urlpath % str(page)``,
    starting at page 1.  The crawl stops when

    * the pager text is missing or identical to the previous page's
      (the site served the same page again, i.e. we ran past the end),
    * an entry's date string compares lower than ``self.endtime``, or
    * the same URL failed to download ``max_failures`` times in a row.

    Each link for which ``checkPath(homepath, self.folder, link)`` is
    falsy is logged through ``LinkLog`` and put on ``fetch_quere``.

    Reads ``self.baseUrl``, ``self.urlpath``, ``self.header``, ``self.br``,
    ``self.kind``, ``self.endtime``, ``self.folder`` and ``self.citycode``.
    Returns nothing.
    """
    max_failures = 5  # consecutive download errors tolerated per URL
    failures = 0
    cond = True
    idx = 0
    checkit = "0"
    year = int(time.strftime('%Y', time.localtime()))
    while cond:
        url = self.baseUrl + self.urlpath % (str(idx + 1))
        print(url)
        req = urllib2.Request(url, None, self.header)
        try:
            p = self.br.open(req).read()
        except Exception:
            # Retry the same page, but never spin forever on a dead URL.
            failures += 1
            if failures >= max_failures:
                print("giving up on %s after %d failed attempts"
                      % (url, failures))
                break
            continue
        failures = 0

        check = PyQuery(p)("div.pager strong span").text()
        if check is None or check == checkit:
            # No pager, or the pager did not advance: nothing new to read.
            break
        checkit = check

        if self.kind == "1" or self.kind == "3":
            links = PyQuery(p)("table.tbimg td.t")
        elif self.kind == "2" or self.kind == "4":
            links = PyQuery(p)("table.tblist tr")
        p = None  # drop the raw page body before processing the rows
        print(len(links))

        for link in links:
            row = PyQuery(link)

            # Reset per row so an entry without a timestamp can neither
            # raise NameError nor inherit the previous row's value.
            tm = None
            if self.kind == "1" or self.kind == "3":
                found = re.search(u'''更新时间:(.*)''', row.text())
                if found:
                    tm = found.group(1)
            elif self.kind == "2" or self.kind == "4":
                tm = row("td.tc").eq(2).text()

            # "today" / "N hours" / "N minutes" stamps are always recent;
            # rows without any stamp are treated as undated and kept.
            if tm and not (u"今天" in tm or u"小时" in tm or u"分钟" in tm):
                # NOTE(review): the stamp carries no year, so the current
                # year is assumed; presumably tm is "MM-DD" and
                # self.endtime is "YYYY-MM-DD" -- confirm.  Around New Year
                # last December's entries would compare as future dates.
                ttt = "%s-%s" % (year, tm)
                if ttt < self.endtime:
                    cond = False
                    break

            lk = row("a.t").attr("href")
            # print lk
            if not lk:
                # Row without a link (e.g. a header row): nothing to fetch.
                continue
            if not checkPath(homepath, self.folder, lk):
                LinkLog.info("%s|%s" % (self.kind, lk))
                fetch_quere.put({"mod": "tongcheng58", "link": lk,
                                 "citycode": self.citycode,
                                 "kind": self.kind})

        # Advance to the next page; without this the loop re-read page 1
        # and stopped immediately on the unchanged pager text.
        idx += 1
def __getAllNeedLinks(self):
    """Walk the paginated listing and enqueue every link not yet stored.

    Pages are requested as ``self.baseUrl + self.urlpath % ("f" + offset)``
    where ``offset`` grows by 32 per page, starting at 0.  The crawl
    stops when

    * the current-page marker is missing or identical to the previous
      page's (the site served the same page again),
    * for kinds "3"/"4", an entry's time stamp is neither an ``MM-DD``
      date nor a "minutes"/"hours" stamp, or
    * the same URL failed to download ``max_failures`` times in a row.

    Each link for which ``checkPath(homepath, self.folder, link)`` is
    falsy is put on ``fetch_quere``.  Finally ``len(self.clinks)`` is
    printed.

    Reads ``self.baseUrl``, ``self.urlpath``, ``self.header``, ``self.br``,
    ``self.kind``, ``self.endtime``, ``self.folder``, ``self.citycode``
    and ``self.clinks``.  Returns nothing.
    """
    max_failures = 5  # consecutive download errors tolerated per URL
    failures = 0
    cond = True
    idx = 0
    checkit = "0"
    year = int(time.strftime('%Y', time.localtime()))
    while cond:
        url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
        # url="http://gz.ganji.com/fang2/u2f0/a1f768/"
        print(url)
        try:
            req = urllib2.Request(url, None, self.header)
            p = self.br.open(req).read()
        except Exception:
            # Retry the same page, but never spin forever on a dead URL.
            failures += 1
            if failures >= max_failures:
                print("giving up on %s after %d failed attempts"
                      % (url, failures))
                break
            continue
        failures = 0

        check = PyQuery(p)("ul.pageLink li a.c").text()
        if check is None or check == checkit:
            # No pager, or the pager did not advance: nothing new to read.
            break
        checkit = check

        links = PyQuery(p)("div.list dl")
        p = None  # drop the raw page body before processing the entries
        print(len(links))

        for link in links:
            entry = PyQuery(link)
            href = entry(" a.list_title").attr("href")
            if not href:
                # Entry without a title anchor: concatenating None to the
                # base URL would raise TypeError, so skip it.
                continue
            lk = self.baseUrl + href

            if self.kind == "3" or self.kind == "4":
                # text() may yield None for an empty selection.
                tm = entry("dd span.time").text() or u""
                if re.match(r'''\d{2}-\d{2}''', tm):
                    tm = "%s-%s" % (year, tm.strip())
                    if tm < self.endtime:
                        # NOTE(review): this only abandons the current
                        # page; the next page is still requested.  Confirm
                        # whether the whole crawl should stop here.
                        break
                # Unicode literals: a byte-string literal tested against
                # unicode text raises UnicodeDecodeError under Python 2's
                # default ASCII codec.
                elif u"分钟" in tm:
                    pass
                elif u"小时" in tm:
                    pass
                else:
                    cond = False
                    break

            if not checkPath(homepath, self.folder, lk):
                fetch_quere.put({"mod": "ganji", "link": lk,
                                 "citycode": self.citycode,
                                 "kind": self.kind})
        idx = idx + 1
    print(len(self.clinks))