def __getPageAllLink(self, p):
    """Fetch every recent posting linked from one listing page.

    A posting counts as recent when its age text mentions minutes or
    hours, or a number of days below 8.  Rows are scanned in page order
    and the scan stops at the first posting that is 8 or more days old.
    Each recent link is logged, handed to ``getContent`` and followed by
    a pause of ``self.st`` seconds.

    Args:
        p: HTML source of the listing page.

    Returns:
        True when the number of recent links found equals the expected
        full-page size (30 for kinds "1"/"2", 35 otherwise), else False.
        Presumably the caller uses this to decide whether another page
        is worth requesting -- TODO confirm against the caller.
    """
    house_layout = self.kind in ("1", "2")
    if house_layout:
        rows = PyQuery(p)("div.house")
        full_page_size = 30
    else:
        rows = PyQuery(p)("div.qiuzu li")
        full_page_size = 35

    links = []
    for row in rows:
        item = PyQuery(row)
        if house_layout:
            # Unicode literals throughout: the scraped text is unicode, and
            # mixing in byte strings makes Python 2 attempt an ASCII decode.
            age = (item("p.time").text() or u"").replace(u"个人", u"")
            href = item("p.housetitle a").attr("href")
        else:
            age = item("span.li5").text() or u""
            href = item("span.li2 a").attr("href")
            if self.kind == "4":
                # Shared rentals are not wanted for kind "4".  Compare on
                # stripped text so the match does not depend on whether the
                # parser keeps the trailing blank of the cell.
                category = (item("span.li1").text() or u"").strip()
                if category == u"合租":
                    continue

        if not href:
            # Row without a detail link (e.g. ad or separator): nothing to fetch.
            continue
        link = self.baseurl + href

        if u"天" in age:
            try:
                days = int(age[:age.find(u"天")])
            except ValueError:
                # Unparseable day count: skip the row instead of aborting the page.
                continue
            if days >= 8:
                # Everything below this row is older still.
                break
        elif u"小时" not in age and u"分钟" not in age:
            continue
        links.append(link)

        # NOTE: unlike the sibling crawlers, no checkPath() de-duplication is
        # applied here (it was disabled in the original), so every recent
        # link is fetched on every run.
        LinkLog.info("%s|%s" % (self.kind, link))
        try:
            getContent(link, self.citycode, self.kind)
        except Exception as e:
            # Best effort: one failing posting must not stop the page scan.
            print("ganji getContent Exception %s" % e)
        time.sleep(int(self.st))

    return len(links) == full_page_size
def __getAllNeedLinks(self):
    """Walk the 58 listing pages and fetch every posting not yet stored.

    Pages are requested in order (1, 2, 3, ...).  The crawl ends when

    * the pager marker is missing or identical to the previous page's
      marker (the site served the same page again),
    * a posting's date, prefixed with the current year, sorts before
      ``self.endtime`` (string comparison, as in the original), or
    * a page cannot be downloaded after a few consecutive attempts.

    For each posting whose link ``checkPath`` reports as falsy, the link is
    logged and passed to ``getContent``, followed by a pause of
    ``self.st`` seconds.  Returns None.
    """
    max_fetch_failures = 3  # consecutive download failures tolerated per page
    failures = 0
    idx = 0
    last_marker = "0"
    update_re = re.compile(u"更新时间:(.*)")
    recent_words = (u"今天", u"小时", u"分钟")

    while True:
        url = self.baseUrl + self.urlpath % (str(idx + 1))
        req = urllib2.Request(url, None, self.header)
        try:
            page = self.br.open(req).read()
        except Exception as e:
            # Retry the same page, but with a pause and an upper bound so a
            # dead URL cannot spin this loop forever.
            failures += 1
            print("58 page fetch Exception %s" % e)
            if failures >= max_fetch_failures:
                return
            time.sleep(int(self.st))
            continue
        failures = 0

        doc = PyQuery(page)
        page = None  # the raw HTML is no longer needed once parsed
        marker = doc("div.pager strong span").text()
        if marker is None or marker == last_marker:
            return
        last_marker = marker

        image_layout = self.kind in ("1", "3")
        if image_layout:
            rows = doc("table.tbimg td.t")
        elif self.kind in ("2", "4"):
            rows = doc("table.tblist tr")
        else:
            # No selector is defined for any other kind.
            return

        year = time.strftime('%Y')
        for row in rows:
            item = PyQuery(row)
            # Reset per row so a row without a timestamp never reuses the
            # previous row's value.
            stamp = None
            if image_layout:
                found = update_re.search(item.text() or u"")
                if found:
                    stamp = found.group(1)
            else:
                stamp = item("td.tc").eq(2).text()

            # Relative ages ("today", hours, minutes) are always recent.
            # Anything else is treated as a month-day date of the current
            # year.  Rows with no readable timestamp skip the age test.
            if stamp and not any(word in stamp for word in recent_words):
                if "%s-%s" % (year, stamp) < self.endtime:
                    return

            lk = item("a.t").attr("href")
            if not lk:
                # Header or filler row without a posting link.
                continue
            if not checkPath(homepath, self.folder, lk):
                LinkLog.info("%s|%s" % (self.kind, lk))
                try:
                    getContent(lk, self.citycode, self.kind, self.upc)
                except Exception as e:
                    # Best effort: one failing posting must not stop the crawl.
                    print("58 getContent Exception %s" % e)
                time.sleep(int(self.st))

        # Advance to the next page; without this the first page was simply
        # fetched twice and the crawl ended on the repeated pager marker.
        idx += 1
def __getAllNeedLinks(self):
    """Walk the ganji listing pages and fetch every posting not yet stored.

    Pages are addressed by offset ("f0", "f32", "f64", ...).  The crawl
    ends when

    * the pager marker is missing or identical to the previous page's
      marker,
    * for kinds "3"/"4", a posting's "MM-DD" date, prefixed with the
      current year, sorts before ``self.endtime`` (string comparison), or
      its time text is neither such a date nor an age in minutes/hours, or
    * a page cannot be downloaded after a few consecutive attempts.

    For each posting whose link ``checkPath`` reports as falsy, the link is
    logged and passed to ``getContent``.  Returns None.
    """
    max_fetch_failures = 3  # consecutive download failures tolerated per page
    retry_delay = 1         # seconds to wait before retrying a failed page
    failures = 0
    idx = 0
    last_marker = "0"
    dated_kind = self.kind in ("3", "4")

    while True:
        url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
        try:
            req = urllib2.Request(url, None, self.header)
            page = self.br.open(req).read()
        except Exception as e:
            # Retry the same page, but with a pause and an upper bound so a
            # dead URL cannot spin this loop forever.
            failures += 1
            print("ganji page fetch Exception %s" % e)
            if failures >= max_fetch_failures:
                return
            time.sleep(retry_delay)
            continue
        failures = 0

        doc = PyQuery(page)
        page = None  # the raw HTML is no longer needed once parsed
        marker = doc("ul.pageLink li a.c").text()
        if marker is None or marker == last_marker:
            return
        last_marker = marker

        for row in doc("div.list dl"):
            item = PyQuery(row)
            href = item(" a.list_title").attr("href")
            if not href:
                # Row without a title link: nothing to fetch.
                continue
            lk = self.baseUrl + href

            if dated_kind:
                stamp = item("dd span.time").text() or u""
                if re.match(r"\d{2}-\d{2}", stamp):
                    year = time.strftime('%Y')
                    if "%s-%s" % (year, stamp.strip()) < self.endtime:
                        return
                # Unicode literals: the scraped text is unicode, and a byte
                # string here makes Python 2 attempt an ASCII decode.
                elif u"分钟" not in stamp and u"小时" not in stamp:
                    # Neither a date nor a minutes/hours age: stop crawling.
                    return

            if not checkPath(homepath, self.folder, lk):
                LinkLog.info("%s|%s" % (self.kind, lk))
                try:
                    getContent(lk, self.citycode, self.kind, self.upc)
                except Exception as e:
                    # Best effort: one failing posting must not stop the crawl.
                    print("ganji getContent Exception %s" % e)

        idx += 1