def extractDict(self):
    """Parse each not-yet-seen URL in ``self.urls`` into ``self.fd`` and dump it.

    For every URL that ``checkPath`` does not report as already handled, the
    parser selected by ``self.kind`` is run ("1" -> ``ChuShou``, "2" ->
    ``ChuZu``, "3" -> ``QiuGou``, anything else -> ``QiuZu``), the city is
    taken from the URL's host with ".58.com" removed, the routing fields are
    added and the record is printed as ``key value`` lines.

    The method returns ``None``; it returns early at the first record whose
    ``posttime`` is more than seven days before ``time.time()``.

    NOTE(review): the source was whitespace-flattened; everything after the
    kind dispatch is assumed to sit inside the "not seen yet" branch --
    confirm against the original layout.
    """
    # Seven days in seconds. The original compared against 7*24*36000
    # (seventy days), contradicting its own "older than seven days" comment.
    max_age = 7 * 24 * 3600

    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue  # already handled

        # Presumably overwritten by the parser; while it stays 0 the record
        # counts as too old and stops the run.
        self.fd["posttime"] = 0

        if self.kind == "1":
            self.ChuShou(url)
        elif self.kind == "2":
            self.ChuZu(url)
        elif self.kind == "3":
            self.QiuGou(url)
        else:
            self.QiuZu(url)

        self.fd['city'] = urlparse(url)[1].replace('.58.com', "")

        # Older than seven days: stop processing altogether.
        if (time.time() - self.fd["posttime"]) > max_age:
            return

        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 0
        self.fd["web_flag"] = "58"

        # is_checked is 0 here, so the record is always dumped.
        if not self.fd["is_checked"]:
            for key, value in self.fd.items():
                print("%s %s" % (key, value))
def extractDict(self):
    """Parse each not-yet-seen URL in ``self.urls`` and upload the record.

    For every URL that ``checkPath`` does not report as already handled, the
    parser selected by ``self.kind`` is run ("1" -> ``sell``, "2" ->
    ``rent``, "3" -> ``buy``, anything else -> ``require``), the URL is
    marked as handled with ``makePath``, and ``self.fd`` is POSTed
    url-encoded to ``http://site.jjr360.com/app.php``. The response body is
    decoded as GBK and printed.

    The method returns ``None``; it returns early at the first record whose
    ``posttime`` is more than seven days before ``time.time()``.

    NOTE(review): the source was whitespace-flattened; the upload is assumed
    to run once per unseen URL (inside the loop, after the ``try``) --
    confirm against the original layout.
    """
    # Seven days in seconds. The original compared against 7*24*36000
    # (seventy days), contradicting its own "older than seven days" comment.
    max_age = 7 * 24 * 3600

    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue  # already handled

        try:
            self.fd["posttime"] = 0
            if self.kind == "1":
                self.sell(url)
            elif self.kind == "2":
                self.rent(url)
            elif self.kind == "3":
                self.buy(url)
            else:
                self.require(url)
            self.fd['city'] = urlparse(url)[1].replace('.58.com', "")
            makePath(homepath, self.folder, url)
            # Older than seven days: stop processing altogether.
            if (time.time() - self.fd["posttime"]) > max_age:
                return
        except Exception as e:
            # Still best effort (whatever was parsed is uploaded), but the
            # failure is reported and Ctrl-C / SystemExit are no longer
            # swallowed as they were by the bare ``except: pass``.
            print("58 extractDict Exception %s %r" % (url, e))

        # .get(): parsing may have failed before 'city' was stored.
        if self.fd.get('city') == 'su':
            self.fd['city'] = 'suzhou'

        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "58"

        req = urllib2.Request("http://site.jjr360.com/app.php",
                              urllib.urlencode(self.fd))
        p = self.br.open(req).read().strip()
        print(p.decode('gbk'))
def extractDict(self):
    """Parse each not-yet-seen URL in ``self.urls`` and upload the record.

    For every URL that ``checkPath`` does not report as already handled, the
    parser selected by ``self.kind`` is run ("1" -> ``sell``, "2" ->
    ``rent``, "3" -> ``buy``, anything else -> ``require``) and the URL is
    marked as handled with ``makePath``. Records whose ``self.fd`` holds
    exactly 7 or 17 entries are skipped; all others are POSTed url-encoded
    to ``http://site.jjr360.com/app.php`` and the GBK-decoded response is
    printed between two separator lines.

    The method returns ``None``; it returns early at the first record whose
    ``posttime`` is more than seven days before ``time.time()``.

    NOTE(review): the source was whitespace-flattened. The first separator
    line is assumed to be printed unconditionally and the upload is assumed
    to run once per unseen URL -- confirm against the original layout.
    """
    # Seven days in seconds. The original compared against 7*24*36000
    # (seventy days), contradicting its own "older than seven days" comment.
    max_age = 7 * 24 * 3600

    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue  # already handled

        try:
            if self.kind == "1":
                self.sell(url)
            elif self.kind == "2":
                self.rent(url)
            elif self.kind == "3":
                self.buy(url)
            else:
                self.require(url)
            makePath(homepath, self.folder, url)

            # A missing posttime used to surface as a KeyError that the bare
            # except swallowed; skip the age check explicitly instead.
            posttime = self.fd.get("posttime")
            if posttime is not None and (time.time() - posttime) > max_age:
                return
        except Exception as e:
            # Still best effort, but the failure is reported and Ctrl-C /
            # SystemExit are no longer swallowed.
            print("gj extractDict Exception %s %r" % (url, e))

        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "gj"

        print("*" * 80)

        # presumably the sizes fd has when the parser contributed nothing
        # useful -- TODO confirm where 7 and 17 come from
        if len(self.fd) in (7, 17):
            print("#####################################")
            continue

        req = urllib2.Request("http://site.jjr360.com/app.php",
                              urllib.urlencode(self.fd))
        p = self.br.open(req).read().strip()
        print(p.decode('gbk'))
        print("*" * 80)
def extractDict(self):
    """Parse the single URL held in ``self.urls`` and return the record.

    The parser is selected by ``self.kind`` ("1" -> ``sell``, "2" ->
    ``rent``, "3" -> ``buy``, anything else -> ``require``). After parsing,
    the URL is marked as handled with ``makePath``, the routing fields are
    added to ``self.fd`` and a one-line progress message is printed.

    Returns:
        ``self.fd``, or ``None`` when ``checkPath`` reports the URL as
        already handled.

    NOTE(review): the source was whitespace-flattened; the field assignments
    and the ``return`` are assumed to belong to the "not seen yet" branch --
    confirm against the original layout.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    try:
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        makePath(homepath, self.folder, self.urls)
        # The seven-day age filter was already disabled in the original.
    except Exception:
        # Best effort: whatever was parsed before the failure is returned.
        msglogger.info("%s 链接采集异常"%self.urls)

    self.fd["c"] = "houseapi"
    self.fd["a"] = "savehouse"
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"

    print("%s %s %s %s %s" % (
        "%s.soufun.com" % self.citycode,
        self.citycode,
        self.kind,
        time.strftime("%Y-%m-%d %H:%M:%S"),  # defaults to the local time
        self.urls,
    ))
    # The debug dump that followed this return was unreachable and is gone.
    return self.fd
def extractDict(self):
    """Parse the single URL held in ``self.urls`` and return the record.

    The parser is selected by ``self.kind`` ("1" -> ``sell``, "2" ->
    ``rent``, "3" -> ``buy``, anything else -> ``require``). After parsing,
    the URL is marked as handled with ``makePath`` and the routing fields
    are added to ``self.fd``.

    Returns:
        ``self.fd``, or ``None`` when ``checkPath`` reports the URL as
        already handled or when the record's ``posttime`` is more than seven
        days before ``time.time()``.

    NOTE(review): the source was whitespace-flattened; the field assignments
    and the ``return`` are assumed to belong to the "not seen yet" branch --
    confirm against the original layout.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    # Seven days in seconds. The original compared against 7*24*36000
    # (seventy days), contradicting its own "older than seven days" comment.
    max_age = 7 * 24 * 3600

    try:
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        makePath(homepath, self.folder, self.urls)

        # A missing posttime used to surface as a KeyError that the bare
        # except swallowed; skip the age check explicitly instead.
        posttime = self.fd.get("posttime")
        if posttime is not None and (time.time() - posttime) > max_age:
            return None
    except Exception as e:
        # Still best effort, but the failure is reported and Ctrl-C /
        # SystemExit are no longer swallowed.
        print("gj extractDict Exception %s %r" % (self.urls, e))

    self.fd["c"] = "houseapi"
    self.fd["a"] = "savehouse"
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"
    # The debug dump that followed this return was unreachable and is gone.
    return self.fd
def extractDict(self):
    """Parse the single URL held in ``self.urls`` and return the record.

    The parser is selected by ``self.kind`` ("1" -> ``sell``, "2" ->
    ``rent``, "3" -> ``buy``, anything else -> ``require``). The city is
    taken from the URL's host with ".58.com" removed and the URL is marked
    as handled with ``makePath``. Keys from ``getDefaultVal(self.kind)``
    that the parser did not set are then filled in.

    Returns:
        * ``None`` when ``checkPath`` reports the URL as already handled;
        * ``None`` in development mode (``isDEV``), after printing each
          default key with its final value and type;
        * ``{}`` when the parser stored an ``is_ok`` equal to ``False``
          (``self.fd`` is replaced by that empty dict);
        * otherwise ``self.fd``.

    NOTE(review): the source was whitespace-flattened; everything after the
    ``try`` is assumed to belong to the "not seen yet" branch -- confirm
    against the original layout.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    try:
        self.fd["posttime"] = 0
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        self.fd['city'] = urlparse(self.urls)[1].replace('.58.com', "")
        makePath(homepath, self.folder, self.urls)
        # The seven-day age filter was already disabled in the original.
    except Exception:
        # Best effort: missing fields are covered by the defaults below.
        msglogger.info("%s 链接采集异常"%self.urls)

    # Fill every field the parser left out (was duplicated in both the
    # development and the production branch).
    dfv = getDefaultVal(self.kind)
    for key, value in dfv.items():
        self.fd.setdefault(key, value)

    if isDEV:
        for key in dfv:
            print("%s %s %s" % (key, self.fd[key], type(self.fd[key])))
        return None

    # A missing city (parse failed early) and the short code 'su' both
    # become 'suzhou'; this replaces a try / bare-except around the lookup.
    if self.fd.get('city', 'su') == 'su':
        self.fd['city'] = 'suzhou'

    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "58"

    # ``== False`` kept on purpose: a missing is_ok (None) must not match.
    if self.fd.get('is_ok') == False:
        self.fd = {}

    # The dump/upload code that followed this return was unreachable and is
    # gone.
    return self.fd
def __getAllNeedLinks(self):
    """Walk the listing pages and pass every unseen link to ``getContent``.

    Pages are requested as ``self.baseUrl + self.urlpath % page_number``,
    starting at page 1. The walk ends when

    * the pager text (``div.pager strong span``) is missing or identical to
      the previous page's, or
    * a row carries a date that, prefixed with the current year, sorts
      before ``self.endtime``, or
    * ``self.kind`` is none of "1".."4", or
    * the page could not be fetched several times in a row.

    Rows whose time text contains 今天 / 小时 / 分钟 are treated as recent
    and never end the walk. Each link not yet known to ``checkPath`` is
    logged to ``LinkLog`` and passed to ``getContent``; exceptions raised
    there are printed and do not stop the walk. Between pages the method
    sleeps ``int(self.st)`` seconds.

    Changes against the original:

    * a failed fetch was retried immediately and forever by a bare
      ``except: continue``; it is now retried after the usual pause and
      abandoned after ``max_failures`` consecutive errors;
    * the page index was never advanced, so page 1 was fetched twice and
      the walk stopped; it now moves to the next page;
    * a row without a recognisable time used to raise NameError or reuse
      the previous row's time; its date check is now skipped;
    * rows without a link are skipped.

    NOTE(review): the source was whitespace-flattened; the sleep is assumed
    to happen once per page rather than once per link -- confirm.
    """
    max_failures = 5  # consecutive fetch errors tolerated before giving up
    failures = 0
    idx = 0
    checkit = "0"  # pager text of the previously processed page
    recent_markers = (u"今天", u"小时", u"分钟")
    updated_at = re.compile(u'更新时间:(.*)')

    while True:
        url = self.baseUrl + self.urlpath % (str(idx + 1))
        req = urllib2.Request(url, None, self.header)
        try:
            p = self.br.open(req).read()
        except Exception:
            failures += 1
            if failures >= max_failures:
                return
            time.sleep(int(self.st))
            continue
        failures = 0

        doc = PyQuery(p)
        p = None  # drop the raw page text once it is parsed

        check = doc("div.pager strong span").text()
        if check is None or check == checkit:
            return
        checkit = check

        if self.kind == "1" or self.kind == "3":
            time_in_cell_text = True
            links = doc("table.tbimg td.t")
        elif self.kind == "2" or self.kind == "4":
            time_in_cell_text = False
            links = doc("table.tblist tr")
        else:
            return  # unsupported kind (used to fail with a NameError)

        for link in links:
            row = PyQuery(link)

            if time_in_cell_text:
                found = updated_at.search(row.text() or u"")
                tm = found.group(1) if found else None
            else:
                tm = row("td.tc").eq(2).text()

            if tm and not any(marker in tm for marker in recent_markers):
                # Anything else is taken as a date without its year.
                stamp = "%s-%s" % (time.strftime('%Y'), tm)
                if stamp < self.endtime:
                    return

            lk = row("a.t").attr("href")
            if not lk:
                continue  # row without a listing link

            if not checkPath(homepath, self.folder, lk):
                LinkLog.info("%s|%s" % (self.kind, lk))
                try:
                    getContent(lk, self.citycode, self.kind, self.upc)
                except Exception as e:
                    print("58 getContent Exception %s" % e)

        idx += 1
        time.sleep(int(self.st))
def __getAllNeedLinks(self):
    """Walk the listing pages and pass every unseen link to ``getContent``.

    Pages are requested as ``self.baseUrl + self.urlpath % ("f" + offset)``
    where the offset grows by 32 per page, starting at 0. The walk ends when

    * the pager text (``ul.pageLink li a.c``) is missing or identical to the
      previous page's, or
    * (kinds "3" and "4" only) an entry's time is an ``MM-DD`` date that,
      prefixed with the current year, sorts before ``self.endtime``, or is
      neither such a date nor a text containing 分钟 / 小时, or
    * the page could not be fetched several times in a row.

    Each link not yet known to ``checkPath`` is logged to ``LinkLog`` and
    passed to ``getContent``; exceptions raised there are printed and do not
    stop the walk.

    Changes against the original:

    * a failed fetch was retried immediately and forever by a bare
      ``except: continue``; it is now abandoned after ``max_failures``
      consecutive errors and no longer swallows Ctrl-C;
    * entries without a title link are skipped (they used to raise
      TypeError on ``baseUrl + None``);
    * a missing time text is handled like any unrecognised one instead of
      raising TypeError inside ``re.match``;
    * the 分钟 / 小时 markers are unicode literals, as in the 58 crawler,
      so testing them against unicode text needs no implicit ASCII decode.
      NOTE(review): assumes ``.text()`` yields unicode -- confirm.
    """
    max_failures = 5  # consecutive fetch errors tolerated before giving up
    failures = 0
    idx = 0
    checkit = "0"  # pager text of the previously processed page
    short_date = re.compile(r'\d{2}-\d{2}')

    while True:
        url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
        try:
            req = urllib2.Request(url, None, self.header)
            p = self.br.open(req).read()
        except Exception:
            failures += 1
            if failures >= max_failures:
                return
            continue
        failures = 0

        doc = PyQuery(p)
        p = None  # drop the raw page text once it is parsed

        check = doc("ul.pageLink li a.c").text()
        if check is None or check == checkit:
            return
        checkit = check

        for link in doc("div.list dl"):
            entry = PyQuery(link)

            href = entry(" a.list_title").attr("href")
            if not href:
                continue  # entry without a listing link
            lk = self.baseUrl + href

            if self.kind == "3" or self.kind == "4":
                tm = entry("dd span.time").text() or u""
                if short_date.match(tm):
                    tm = "%s-%s" % (time.strftime('%Y'), tm.strip())
                    if tm < self.endtime:
                        return
                elif not (u"分钟" in tm or u"小时" in tm):
                    # Neither a date nor a minutes/hours-ago text: stop.
                    return

            if not checkPath(homepath, self.folder, lk):
                LinkLog.info("%s|%s" % (self.kind, lk))
                try:
                    getContent(lk, self.citycode, self.kind, self.upc)
                except Exception as e:
                    print("ganji getContent Exception %s" % e)

        idx += 1
def extractDict(self):
    """Download each not-yet-seen URL and scrape its fields into ``self.fd``.

    For every URL in ``self.urls`` that ``checkPath`` does not report as
    already handled, the page is fetched through ``self.br`` and the
    ``self.*_r`` patterns are applied to it. A field whose pattern does not
    match (or whose group is empty) is stored as "". Pages whose house-type
    pattern captures "商铺" are skipped. The room/hall/toilet counts are cut
    out of the layout text captured by ``self.hrht_r`` using the markers
    室 / 厅 / 卫. Finally the URL is marked as handled with ``makePath`` and
    the record is printed as ``key value`` lines followed by a separator.

    The method returns ``None``.

    Changes against the original:

    * ``house_type`` is "" when the house-type pattern does not match; the
      original raised NameError on the first such page and silently reused
      the previous page's value afterwards;
    * each pattern is searched once instead of twice;
    * the offset past a layout marker is ``len(marker)`` instead of a
      hard-coded 3 (the same value for UTF-8 byte strings).

    NOTE(review): the source was whitespace-flattened; ``makePath`` and the
    dump are assumed to run once per scraped URL -- confirm.
    """

    def first_group(pattern, text, group=1):
        # Requested group of the first match; "" without a match or when
        # the group is empty / did not participate.
        found = re.search(pattern, text)
        return (found.group(group) or "") if found else ""

    def take_before(text, marker, start):
        # (text between ``start`` and ``marker``, position just past the
        # marker); ("", start) when the marker is absent.
        pos = text.find(marker)
        if pos == -1:
            return "", start
        return text[start:pos], pos + len(marker)

    # Fields filled straight from group 1 of their pattern.
    simple_fields = (
        ("borough_section", self.ad_r),
        ("cityarea", self.ca_r),
        ("house_fitment", self.fm_r),
        ("belong", self.bl_r),
        ("house_price", self.hp_r),
        ("house_totalarea", self.hta_r),
        ("house_age", self.ha_r),
        ("house_sup", self.hs_r),
        ("house_desc", self.hd_r),
        ("borough_name", self.nm_r),
    )

    self.fd["citycode"] = self.citycode

    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue

        req = urllib2.Request(url, None, self.header)
        page = self.br.open(req).read()

        ht = ""
        type_match = re.search(self.ht_r, page)
        if type_match:
            if "商铺" == type_match.group(1):
                continue
            ht = housetype(type_match.group(1))
        self.fd["house_type"] = ht
        self.fd["house_kind"] = self.kind

        for key, pattern in simple_fields:
            self.fd[key] = first_group(pattern, page)

        # Layout text: the counts sit in front of their markers.
        layout = first_group(self.hrht_r, page)
        cursor = 0
        self.fd["house_room"], cursor = take_before(layout, "室", cursor)
        self.fd["house_hall"], cursor = take_before(layout, "厅", cursor)
        self.fd["house_toilet"], cursor = take_before(layout, "卫", cursor)

        # One pattern yields both the floor and the top floor.
        floor_match = re.search(self.hf_r, page)
        self.fd["house_floor"] = (floor_match.group(1) or "") if floor_match else ""
        self.fd["house_topfloor"] = (floor_match.group(2) or "") if floor_match else ""

        makePath(homepath, self.folder, url)

        for key, value in self.fd.items():
            print("%s %s" % (key, value))
        print("=" * 60)
def extractDict(self):
    """Parse the single URL held in ``self.urls`` and return the record.

    The parser is selected by ``self.kind`` ("1" -> ``sell``, "2" ->
    ``rent``, "3" -> ``buy``, anything else -> ``require``) and the URL is
    marked as handled with ``makePath``. If parsing raises,
    ``self.fd['house_title']`` is set to ``None`` and the failure is logged.
    Keys from ``getDefaultVal(self.kind)`` that are still missing are then
    filled in.

    Returns:
        * ``None`` when ``checkPath`` reports the URL as already handled;
        * ``None`` in development mode (``isDEV``), after printing each
          default key with its final value and type;
        * otherwise ``self.fd``.

    NOTE(review): the source was whitespace-flattened; everything after the
    ``try`` is assumed to belong to the "not seen yet" branch -- confirm
    against the original layout.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    try:
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        makePath(homepath, self.folder, self.urls)
        # The seven-day age filter was already disabled in the original.
    except Exception:
        # presumably a None title marks the record as failed downstream --
        # TODO confirm with the consumer of fd
        self.fd['house_title'] = None
        msglogger.info("%s 链接采集异常"%self.urls)

    # Fill every field the parser left out (was duplicated in both the
    # development and the production branch).
    dfv = getDefaultVal(self.kind)
    for key, value in dfv.items():
        self.fd.setdefault(key, value)

    if isDEV:
        for key in dfv:
            print("%s %s %s" % (key, self.fd[key], type(self.fd[key])))
        return None

    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"
    # The debug dump that followed this return was unreachable and is gone.
    return self.fd