def __getAllNeedLinks(self):
    """Crawl the paginated listing and fetch every listing not handled yet.

    Pages are requested at ``self.baseUrl + self.urlpath % "f<idx*32>"`` for
    idx = 0, 1, 2, ...  The crawl stops when one of these happens:

    * the text of the current-page marker (``ul.pageLink li a.c``) is
      missing, or is identical to the one seen on the previous page, i.e.
      the pagination no longer advances;
    * ``self.kind`` is "3" or "4" and a listing's timestamp is an ``MM-DD``
      date older than ``self.endtime``, or is neither such a date nor a
      relative time containing 分钟 (minutes) / 小时 (hours);
    * the same page fails to download ``max_failures`` times in a row.

    For every listing link for which
    ``checkPath(homepath, self.folder, link)`` is falsy, the link is logged
    to ``LinkLog`` and handed to ``getContent``.  Exceptions raised by
    ``getContent`` are printed and do not interrupt the crawl.
    """
    # Consecutive download failures tolerated for one page.  The previous
    # bare ``except: continue`` retried the same URL forever.
    max_failures = 3
    failures = 0
    idx = 0
    checkit = "0"
    stop = False
    while not stop:
        url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
        try:
            req = urllib2.Request(url, None, self.header)
            page = self.br.open(req).read()
        except Exception as e:
            # ``Exception`` (not a bare except) so Ctrl-C / SystemExit
            # still terminate the crawler.
            failures += 1
            if failures >= max_failures:
                print("list page fetch failed (%s): %s" % (url, e))
                break
            continue  # retry the same page; idx is left unchanged
        failures = 0

        doc = PyQuery(page)
        page = None  # drop the raw HTML; only the parsed nodes are needed
        check = doc("ul.pageLink li a.c").text()
        if check is None or check == checkit:
            break
        checkit = check

        for link in doc("div.list dl"):
            item = PyQuery(link)
            href = item(" a.list_title").attr("href")
            if href is None:
                # Entry without a title link: nothing to fetch.
                continue
            lk = self.baseUrl + href

            if self.kind == "3" or self.kind == "4":
                # text() may yield None when the node is absent; treat that
                # like an unrecognised timestamp instead of raising.
                tm = item("dd span.time").text() or ""
                if re.match(r"\d{2}-\d{2}", tm):
                    # The page shows only month-day; prefix the current
                    # year so it compares as a string with self.endtime.
                    # NOTE(review): assumes self.endtime is a
                    # "YYYY-MM-DD" string -- confirm against the caller.
                    tm = "%s-%s" % (time.localtime().tm_year, tm.strip())
                    expired = tm < self.endtime
                else:
                    expired = not ("分钟" in tm or "小时" in tm)
                if expired:
                    stop = True
                    break

            if not checkPath(homepath, self.folder, lk):
                LinkLog.info("%s|%s" % (self.kind, lk))
                try:
                    getContent(lk, self.citycode, self.kind, self.upc)
                except Exception as e:
                    print("ganji getContent Exception %s" % e)
        idx = idx + 1
def __getAllNeedLinks(self):
    """Crawl the paginated listing and collect new links in ``self.clinks``.

    Pages are requested at ``self.baseUrl + self.urlpath % "f<idx*32>"`` for
    idx = 0, 1, 2, ..., sleeping ``self.st`` seconds between pages.  Each
    page URL, the number of listing nodes on it and the final size of
    ``self.clinks`` are printed.  The crawl stops when one of these happens:

    * the text of the current-page marker (``ul.pageLink li a.c``) is
      identical to the one seen on the previous page;
    * ``self.kind`` is "3" or "4" and a listing's timestamp is an ``MM-DD``
      date older than ``self.endtime``, or is neither such a date nor a
      relative time containing 分钟 (minutes) / 小时 (hours);
    * ``max_failures`` pages in a row fail to download.

    A page that fails to download is skipped, not retried.
    """
    # Consecutive failed pages tolerated before giving up.  The previous
    # bare ``except: pass`` kept paging forever when every request failed.
    max_failures = 3
    failures = 0
    # Set mirror of self.clinks so the duplicate check is O(1) per link.
    known = set(self.clinks)
    idx = 0
    checkit = "0"
    while True:
        url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
        print(url)
        try:
            req = urllib2.Request(url, None, self.header)
            page = self.br.open(req).read()
        except Exception as e:
            # ``Exception`` (not a bare except) so Ctrl-C / SystemExit
            # still terminate the crawler.
            failures += 1
            print("list page fetch failed (%s): %s" % (url, e))
            if failures >= max_failures:
                break
        else:
            failures = 0
            doc = PyQuery(page)
            check = doc("ul.pageLink li a.c").text()
            if check == checkit:
                break
            checkit = check

            links = doc("div.list dl")
            print(len(links))
            stop = False
            for link in links:
                item = PyQuery(link)
                href = item(" a.list_title").attr("href")
                if href is None:
                    # Entry without a title link: nothing to collect.
                    continue
                lk = self.baseUrl + href

                if self.kind == "3" or self.kind == "4":
                    # text() may yield None when the node is absent; treat
                    # that like an unrecognised timestamp, do not raise.
                    tm = item("dd span.time").text() or ""
                    if re.match(r"\d{2}-\d{2}", tm):
                        # The page shows only month-day; prefix the current
                        # year so it compares as a string with self.endtime.
                        # NOTE(review): assumes self.endtime is a
                        # "YYYY-MM-DD" string -- confirm against the caller.
                        tm = "%s-%s" % (time.localtime().tm_year, tm.strip())
                        expired = tm < self.endtime
                    else:
                        expired = not ("分钟" in tm or "小时" in tm)
                    if expired:
                        # Previously this only left the inner loop, so the
                        # crawl went on through every older page.
                        stop = True
                        break

                if lk not in known:
                    known.add(lk)
                    self.clinks.append(lk)
            if stop:
                break
        idx = idx + 1
        time.sleep(self.st)
    print(len(self.clinks))