def extractDict(self): if 0:#checkPath(homepath,self.folder,self.urls): return None else: self.fd["citycode"] = self.citycode self.fd["cityname"] = citynameDict_sf[self.citycode] self.fd["c"]="houseapi" self.fd["a"]="savehouse" self.fd["is_checked"] = 1 self.fd["web_flag"] = "sf" self.fd["is_ok"] = True print self.urls try: if self.kind=="3": self.buy(self.urls) self.fd["house_flag"] = 3 elif self.kind=="4": self.require(self.urls) self.fd["house_flag"] = 4 elif self.kind=="2": self.rent(self.urls) self.fd["house_flag"] = 2 elif self.kind=="1": self.sell(self.urls) self.fd["house_flag"] = 1 makePath(homepath,self.folder,self.urls) except Exception,e: print e pass else:
def extractDict(self):
    """Scrape each URL in ``self.urls`` and post the result to the house API.

    A URL is skipped when ``checkPath(homepath, self.folder, url)`` is truthy
    (presumably "already crawled" -- confirm against checkPath).  Otherwise
    the handler selected by ``self.kind`` ("1" sell, "2" rent, "3" buy,
    anything else require) fills ``self.fd``, the city is derived from the
    URL's host name, and ``self.fd`` is form-encoded and sent to
    site.jjr360.com; the decoded response is printed.

    Returns None.  Processing of the remaining URLs stops as soon as a
    listing's ``posttime`` is more than seven days old.
    """
    # Seconds in seven days.  The original factor was 36000, i.e. 70 days,
    # which contradicted the "older than seven days" intent of the check.
    seven_days = 7 * 24 * 3600

    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue

        try:
            # Reset so a stale timestamp from the previous URL is never reused.
            self.fd["posttime"] = 0
            if self.kind == "1":
                self.sell(url)
            elif self.kind == "2":
                self.rent(url)
            elif self.kind == "3":
                self.buy(url)
            else:
                self.require(url)
            self.fd['city'] = urlparse(url)[1].replace('.58.com', "")
            makePath(homepath, self.folder, url)
            # Older than seven days: abort the whole run.
            # NOTE(review): assumes posttime is epoch seconds -- confirm.
            if (time.time() - self.fd["posttime"]) > seven_days:
                return
        except Exception as e:
            # Best effort: report the failure and still post what was gathered.
            # repr() keeps the report safe for non-ASCII exception messages.
            print(repr(e))

        # .get(): the handler may have failed before 'city' was ever set.
        if self.fd.get('city') == 'su':
            self.fd['city'] = 'suzhou'
        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "58"

        req = urllib2.Request("http://site.jjr360.com/app.php",
                              urllib.urlencode(self.fd))
        p = self.br.open(req).read().strip()
        print(p.decode('gbk'))
def extractDict(self):
    """Scrape each URL in ``self.urls`` and post the result to the house API.

    A URL is skipped when ``checkPath(homepath, self.folder, url)`` is truthy
    (presumably "already crawled" -- confirm against checkPath).  Otherwise
    the handler selected by ``self.kind`` ("1" sell, "2" rent, "3" buy,
    anything else require) fills ``self.fd``, which is then form-encoded and
    sent to site.jjr360.com; the decoded response is printed between two
    separator lines.

    Returns None.  Processing of the remaining URLs stops as soon as a
    listing's ``posttime`` is more than seven days old.
    """
    # Seconds in seven days.  The original factor was 36000, i.e. 70 days,
    # which contradicted the "older than seven days" intent of the check.
    seven_days = 7 * 24 * 3600

    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue

        try:
            if self.kind == "1":
                self.sell(url)
            elif self.kind == "2":
                self.rent(url)
            elif self.kind == "3":
                self.buy(url)
            else:
                self.require(url)
            makePath(homepath, self.folder, url)
            # Older than seven days: abort the whole run.
            # NOTE(review): assumes posttime is epoch seconds -- confirm.
            if (time.time() - self.fd["posttime"]) > seven_days:
                return
        except Exception as e:
            # Best effort: report the failure and carry on with this URL.
            # repr() keeps the report safe for non-ASCII exception messages.
            print(repr(e))

        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "gj"

        # NOTE(review): in the collapsed original this separator could also
        # have belonged to a debug dump that never ran; kept as live output.
        print("*" * 80)

        # NOTE(review): 7 and 17 look like the field counts left behind when
        # a handler extracted nothing useful -- confirm; such records are
        # not posted.
        if len(self.fd) in (7, 17):
            print("#####################################")
            continue

        req = urllib2.Request("http://site.jjr360.com/app.php",
                              urllib.urlencode(self.fd))
        p = self.br.open(req).read().strip()
        print(p.decode('gbk'))
        print("*" * 80)
def extractDict(self):
    """Scrape the single URL in ``self.urls`` and return the collected fields.

    Returns None when ``checkPath(homepath, self.folder, self.urls)`` is
    truthy (presumably "already crawled" -- confirm against checkPath).
    Otherwise the handler selected by ``self.kind`` ("1" sell, "2" rent,
    "3" buy, anything else require) fills ``self.fd``; a handler failure is
    logged through ``msglogger`` and does not stop the method.  The API
    routing fields are then added, a one-line status is printed, and
    ``self.fd`` is returned.

    The seven-day age filter used by sibling variants is disabled here.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    try:
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        makePath(homepath, self.folder, self.urls)
    except Exception:
        msglogger.info("%s 链接采集异常"%self.urls)

    self.fd["c"] = "houseapi"
    self.fd["a"] = "savehouse"
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"

    # NOTE(review): the status line names soufun.com although web_flag is
    # "gj" -- looks copied from another crawler; left unchanged.
    # strftime() without a time tuple formats the current local time.
    print("%s %s %s %s %s" % (
        "%s.soufun.com" % self.citycode,
        self.citycode,
        self.kind,
        time.strftime("%Y-%m-%d %H:%M:%S"),
        self.urls,
    ))
    return self.fd
def extractDict(self):
    """Scrape the single URL in ``self.urls`` and return the collected fields.

    Returns None when ``checkPath(homepath, self.folder, self.urls)`` is
    truthy (presumably "already crawled" -- confirm against checkPath), or
    when the listing's ``posttime`` is more than seven days old.  Otherwise
    the handler selected by ``self.kind`` ("1" sell, "2" rent, "3" buy,
    anything else require) fills ``self.fd``, the API routing fields are
    added, and ``self.fd`` is returned.  A handler failure is reported on
    stdout and does not stop the method.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    # Seconds in seven days.  The original factor was 36000, i.e. 70 days,
    # which contradicted the "older than seven days" intent of the check.
    seven_days = 7 * 24 * 3600

    try:
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        makePath(homepath, self.folder, self.urls)
        # Older than seven days: drop the listing.
        # NOTE(review): assumes posttime is epoch seconds -- confirm.
        if (time.time() - self.fd["posttime"]) > seven_days:
            return None
    except Exception as e:
        # Best effort: report the failure and still return what was gathered.
        # repr() keeps the report safe for non-ASCII exception messages.
        print(repr(e))

    self.fd["c"] = "houseapi"
    self.fd["a"] = "savehouse"
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"
    return self.fd
def extractDict(self):
    """Scrape the single URL in ``self.urls`` and return the collected fields.

    Returns None when ``checkPath(homepath, self.folder, self.urls)`` is
    truthy (presumably "already crawled" -- confirm against checkPath).
    Otherwise the handler selected by ``self.kind`` ("1" sell, "2" rent,
    "3" buy, anything else require) fills ``self.fd`` and the city is
    derived from the URL's host name; a failure is logged through
    ``msglogger`` and does not stop the method.  Keys missing from
    ``self.fd`` are then filled from ``getDefaultVal(self.kind)``.

    When ``isDEV`` is truthy every default-backed field is printed with its
    type and None is returned.  Otherwise the city is normalised, the record
    is flagged for the "58" site, and ``self.fd`` is returned -- replaced by
    an empty dict when its ``is_ok`` entry equals False.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    try:
        self.fd["posttime"] = 0
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        self.fd['city'] = urlparse(self.urls)[1].replace('.58.com', "")
        makePath(homepath, self.folder, self.urls)
    except Exception:
        msglogger.info("%s 链接采集异常"%self.urls)

    # Both the dev and the production path fill in missing fields the same
    # way, so do it once up front.
    dfv = getDefaultVal(self.kind)
    for key, value in dfv.items():
        self.fd.setdefault(key, value)

    if isDEV:
        # Dev mode only inspects the record; nothing is returned.
        for key in dfv:
            print("%s %s %s" % (key, self.fd[key], type(self.fd[key])))
        return None

    # A missing city (handler failed early) and the short code 'su' both
    # map to 'suzhou'.
    if 'city' not in self.fd or self.fd['city'] == 'su':
        self.fd['city'] = 'suzhou'
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "58"

    # `== False` (not `is False`) is kept so a 0 flag is rejected as well.
    if self.fd.get('is_ok') == False:
        self.fd = {}
    return self.fd
def extractDict(self):
    """Download each URL in ``self.urls`` and extract house fields by regex.

    A URL is skipped when ``checkPath(homepath, self.folder, url)`` is truthy
    (presumably "already crawled" -- confirm against checkPath) or when the
    house-type pattern captures "商铺".  For every other page the ``*_r``
    patterns held on ``self`` are searched and their captures stored in
    ``self.fd`` ("" when a pattern does not match), ``makePath`` is called
    for the URL, and the collected fields are printed.

    Returns None.
    """

    def first_group(pattern, text, group=1):
        # Single search per field; "" when there is no match or the group is
        # empty, exactly what the original `x != None and ... or ""` gave.
        found = re.search(pattern, text)
        if not found:
            return ""
        return found.group(group) or ""

    self.fd["citycode"] = self.citycode
    for url in self.urls:
        if checkPath(homepath, self.folder, url):
            continue

        req = urllib2.Request(url, None, self.header)
        page = self.br.open(req).read()

        type_match = re.search(self.ht_r, page)
        if type_match:
            if "商铺" == type_match.group(1):
                continue
            self.fd["house_type"] = housetype(type_match.group(1))

        self.fd["borough_section"] = first_group(self.ad_r, page)
        self.fd["cityarea"] = first_group(self.ca_r, page)
        self.fd["house_fitment"] = first_group(self.fm_r, page)
        self.fd["house_kind"] = self.kind
        self.fd["belong"] = first_group(self.bl_r, page)
        self.fd["house_price"] = first_group(self.hp_r, page)
        self.fd["house_totalarea"] = first_group(self.hta_r, page)

        # The layout capture is cut at the 室 / 厅 / 卫 markers; each field
        # is the text between the previous marker and its own.
        layout = first_group(self.hrht_r, page)
        cursor = 0
        for marker, field in (("室", "house_room"),
                              ("厅", "house_hall"),
                              ("卫", "house_toilet")):
            pos = layout.find(marker)
            if pos != -1:
                self.fd[field] = layout[cursor:pos]
                # len(marker) replaces the hard-coded 3 (the UTF-8 byte
                # width of one of these characters).
                cursor = pos + len(marker)
            else:
                self.fd[field] = ""

        self.fd["house_floor"] = first_group(self.hf_r, page, 1)
        self.fd["house_topfloor"] = first_group(self.hf_r, page, 2)
        self.fd["house_age"] = first_group(self.ha_r, page)
        self.fd["house_sup"] = first_group(self.hs_r, page)
        self.fd["house_desc"] = first_group(self.hd_r, page)
        self.fd["borough_name"] = first_group(self.nm_r, page)

        makePath(homepath, self.folder, url)

        for key, value in self.fd.items():
            print("%s %s" % (key, value))
        print("=" * 60)
def extractDict(self):
    """Scrape the single URL in ``self.urls`` and return the collected fields.

    Returns None when ``checkPath(homepath, self.folder, self.urls)`` is
    truthy (presumably "already crawled" -- confirm against checkPath).
    Otherwise the handler selected by ``self.kind`` ("1" sell, "2" rent,
    "3" buy, anything else require) fills ``self.fd``.  If it raises,
    ``house_title`` is set to None and the failure is logged through
    ``msglogger``.  Keys missing from ``self.fd`` are then filled from
    ``getDefaultVal(self.kind)``.

    When ``isDEV`` is truthy every default-backed field is printed with its
    type and None is returned; otherwise the record is flagged for the "gj"
    site and ``self.fd`` is returned.

    The seven-day age filter used by sibling variants is disabled here.
    """
    if checkPath(homepath, self.folder, self.urls):
        return None

    try:
        if self.kind == "1":
            self.sell(self.urls)
        elif self.kind == "2":
            self.rent(self.urls)
        elif self.kind == "3":
            self.buy(self.urls)
        else:
            self.require(self.urls)
        makePath(homepath, self.folder, self.urls)
    except Exception:
        # NOTE(review): a None title presumably marks the record as failed
        # for the caller -- confirm.
        self.fd['house_title'] = None
        msglogger.info("%s 链接采集异常"%self.urls)

    # Both the dev and the production path fill in missing fields the same
    # way, so do it once up front.
    dfv = getDefaultVal(self.kind)
    for key, value in dfv.items():
        self.fd.setdefault(key, value)

    if isDEV:
        # Dev mode only inspects the record; nothing is returned.
        for key in dfv:
            print("%s %s %s" % (key, self.fd[key], type(self.fd[key])))
        return None

    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"
    return self.fd