def extractDict(self): if checkPath(homepath,self.folder,self.urls): pass else: try: if self.kind=="1": self.sell(self.urls) elif self.kind=="2": self.rent(self.urls) elif self.kind=="3": self.buy(self.urls) else: self.require(self.urls) makePath(homepath,self.folder,self.urls) #超过七天 # if (time.time() -self.fd["posttime"]) > 7*24*36000:return except Exception,e: msglogger.info("%s 链接采集异常"%self.urls) # print "%s||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"%self.urls self.fd["c"]="houseapi" self.fd["a"]="savehouse" self.fd["is_checked"] = 1 self.fd["web_flag"] = "gj" print "%s %s %s %s %s"%(("%s.soufun.com"% self.citycode),self.citycode, self.kind ,time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())), self.urls) return self.fd if not self.fd["is_checked"]: for i in self.fd.items(): print i[0],i[1] print "*"*80
def extractDict(self):
    """Crawl one 58.com listing URL and return the field dict ``self.fd``.

    Dispatches on ``self.kind`` ("1" sell, "2" rent, "3" buy, else require),
    derives ``city`` from the URL host, backfills missing keys from
    ``getDefaultVal``, and stamps the API flag fields.  In dev mode (isDEV)
    it only prints the merged dict and returns None.
    """
    if checkPath(homepath,self.folder,self.urls):
        # Already captured on a previous run -- skip the network fetch.
        pass
    else:
        try:
            self.fd["posttime"] = 0
            if self.kind=="1":
                self.sell(self.urls)
            elif self.kind=="2":
                self.rent(self.urls)
            elif self.kind=="3":
                self.buy(self.urls)
            else:
                self.require(self.urls)
            # City slug is the URL subdomain, e.g. "su.58.com" -> "su".
            self.fd['city'] = urlparse(self.urls)[1].replace('.58.com',"")
            # Record the URL so later runs treat it as already crawled.
            makePath(homepath,self.folder,self.urls)
            # Skip listings older than seven days (disabled):
            # if self.fd["posttime"]:
            #     if (time.time() -self.fd["posttime"]) > 7*24*36000:return
        except Exception,e:
            # Best-effort crawl: log ("link capture error") and fall through
            # with whatever partial self.fd was built.
            msglogger.info("%s 链接采集异常"%self.urls)
            # print "%s||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"%self.urls
    if isDEV:
        # Dev mode: merge defaults, dump every field for inspection, and
        # bail out WITHOUT returning self.fd (returns None) -- presumably to
        # avoid posting from a dev box; TODO confirm callers tolerate None.
        # self.fd.update(getDefaultVal(4))
        dfv=getDefaultVal(self.kind)
        for item in dfv.items() :
            # print item[0],item[1]
            if item[0] not in self.fd:
                self.fd[item[0]]=dfv.get(item[0])
        for item in dfv.items() :
            print item[0],self.fd[item[0]],type(self.fd[item[0]])
        return
    else:
        # Production: backfill only the keys the crawl did not set.
        dfv=getDefaultVal(self.kind)
        for item in dfv.items() :
            # print item[0],item[1]
            if item[0] not in self.fd:
                self.fd[item[0]]=dfv.get(item[0])
    # Normalize the Suzhou slug; the except arm also covers a missing
    # 'city' key (EAFP), defaulting to 'suzhou'.
    try:
        if self.fd['city'] == 'su':self.fd['city'] = 'suzhou'
    except:
        self.fd['city'] = 'suzhou'
    # Routing/flag fields consumed by the remote house-saving API.
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "58"
    if self.fd.get('is_ok')==False:
        # Extraction marked the record invalid -- discard all fields.
        # print "jjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjjj"
        self.fd={}
    #print "%s %s %s %s %s"%(("%s.soufun.com"% self.citycode),self.citycode, self.kind ,time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())), self.urls)
    return self.fd
    # NOTE(review): everything below appears unreachable after the return
    # above -- including the urllib2 POST to site.jjr360.com.  It may be a
    # deliberately disabled upload path; confirm against VCS history before
    # deleting.
    if not self.fd["is_checked"]:
        for i in self.fd.items():
            print i[0],i[1]
    req=urllib2.Request("http://site.jjr360.com/app.php", urllib.urlencode(self.fd))
    p=self.br.open(req).read().strip()
    print p.decode('gbk')
def extractDict(self):
    """Crawl one listing URL and return the field dict ``self.fd``.

    Same shape as the other ``extractDict`` variants: dispatch on
    ``self.kind`` ("1" sell, "2" rent, "3" buy, else require), backfill
    defaults from ``getDefaultVal``, stamp the API flag fields
    (web_flag "gj").  In dev mode (isDEV) it only prints the merged dict
    and returns None.
    """
    if checkPath(homepath,self.folder,self.urls):
        # Already captured on a previous run -- skip the network fetch.
        pass
    else:
        try:
            if self.kind=="1":
                self.sell(self.urls)
            elif self.kind=="2":
                self.rent(self.urls)
            elif self.kind=="3":
                self.buy(self.urls)
            else:
                self.require(self.urls)
            # Record the URL so later runs treat it as already crawled.
            makePath(homepath,self.folder,self.urls)
            # Skip listings older than seven days (disabled):
            # if (time.time() -self.fd["posttime"]) > 7*24*36000:return
        except Exception,e:
            # On failure, null the title (downstream marker for a bad
            # record -- TODO confirm) and log "link capture error".
            self.fd['house_title']=None
            msglogger.info("%s 链接采集异常"%self.urls)
            # print "%s||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||"%self.urls
    if isDEV:
        # Dev mode: merge defaults, dump every field, and bail out WITHOUT
        # returning self.fd (returns None); presumably skips the upload
        # path on dev boxes -- confirm callers tolerate None.
        # self.fd.update(getDefaultVal(4))
        dfv=getDefaultVal(self.kind)
        for item in dfv.items() :
            # print item[0],item[1]
            if item[0] not in self.fd:
                self.fd[item[0]]=dfv.get(item[0])
        for item in dfv.items() :
            print item[0],self.fd[item[0]],type(self.fd[item[0]])
        return
    else:
        # Production: backfill only the keys the crawl did not set.
        dfv=getDefaultVal(self.kind)
        for item in dfv.items() :
            # print item[0],item[1]
            if item[0] not in self.fd:
                self.fd[item[0]]=dfv.get(item[0])
    # Routing/flag fields consumed by the remote house-saving API.
    self.fd["is_checked"] = 1
    self.fd["web_flag"] = "gj"
    #print "%s %s %s %s %s"%(("%s.soufun.com"% self.citycode),self.citycode, self.kind ,time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())), self.urls)
    return self.fd
    # NOTE(review): the block below is unreachable after the return above;
    # dead debug code, safe to delete.
    if not self.fd["is_checked"]:
        for i in self.fd.items():
            print i[0],i[1]
    print "*"*80
def getLinks(d):
    """Run the link crawler forever for one city/kind config dict.

    d carries "citycode", "kind", "st1" (crawler arg) and "st2"
    (sleep seconds between passes).  Never returns.
    """
    lc=LinkCrawl(d["citycode"],d["kind"],d["st1"])
    while True:
        lc.runme()
        # Clear collector garbage each pass to keep the long-running
        # process from accumulating uncollectable cycles.
        del gc.garbage[:]
        time.sleep(int(d["st2"]))

def getContent(clinks,citycode,kind):
    """Crawl the given links, post the extracted dict, return it.

    Posting is best-effort: any exception from postHost is captured into
    the log line instead of propagating.
    """
    cc=ContentCrawl(clinks,citycode,kind)
    fd=cc.extractDict()
    res=""
    try:
        res=postHost(fd)
    except Exception,e:
        # Swallow post failures; the exception object itself becomes the
        # logged "result".
        res=e
    print res
    msglogger.info("%s|%s|%s"%(clinks,res,""))
    return fd

if __name__=="__main__":
    # Ad-hoc manual test driver: crawl one wuxi link list (kind 4), then
    # extract a single hard-coded listing (kind 3).  The commented lines
    # are alternate fixtures for the other kinds.
    lc=LinkCrawl(citycode="wuxi",kind="4")
    lc.runme()
    #cc=ContentCrawl("http://esf.wuxi.soufun.com/chushou/1_119888237_-1.htm#p=1",citycode="wuxi",kind="1")
    #cc=ContentCrawl("http://rent.wuxi.soufun.com/chuzu/1_49544277_-1.htm",citycode="wuxi",kind="2")
    cc=ContentCrawl("http://esf.wuxi.soufun.com/qiugou/1_860333_-1.htm",citycode="wuxi",kind="3")
    #cc=ContentCrawl("http://rent.wuxi.soufun.com/qiuzu/1_55103674_-1.htm",citycode="wuxi",kind="4")
    cc.extractDict()
    # lf=file("link.log")
# NOTE(review): this chunk starts mid-function -- the statements below look
# like the body of a getLinks(d)-style loop whose `def` header was lost in
# the paste (compare the sibling getLinks implementation); confirm against
# the original file before relying on this reconstruction.
lc=LinkCrawl(d["citycode"],d["kind"],d["st1"])
while True:
    lc.runme()
    time.sleep(int(d["st2"]))
    # Clear collector garbage each pass to keep the long-running process
    # from accumulating uncollectable cycles.
    del gc.garbage[:]

def getContent(clinks,citycode,kind,upc):
    """Crawl the given links, post the extracted dict (best-effort).

    Unlike the sibling getContent variant this takes an extra ``upc``
    argument (forwarded to ContentCrawl) and returns None.
    """
    # return
    cc=ContentCrawl(clinks,citycode,kind,upc)
    fd=cc.extractDict()
    res=""
    try:
        res=postHost(fd)
    except Exception,e:
        # Swallow post failures; the exception object itself becomes the
        # logged "result".
        res=e
    print res
    msglogger.info("%s|%s|%s"%(clinks,res,kind))
    del gc.garbage[:]

if __name__=="__main__":
    # Ad-hoc driver: start one link-crawl thread for liaoyang/kind 3.
    # The commented lines are single-URL extraction fixtures for manual
    # testing of each kind.
    # lc=LinkCrawl(citycode="liaoyang",kind="4")
    # lc.runme()
    # url1 = "http://su.58.com/ershoufang/6432469244037x.shtml"
    # url2 = "http://su.58.com/zufang/6437371140226x.shtml"
    # url3 = "http://su.58.com/ershoufang/6383611408516x.shtml"
    # url4 = "http://su.58.com/qiuzu/6268009935368x.shtml"
    # cc=ContentCrawl([url4],citycode="su",kind="4")
    # cc.extractDict()
    import gc
    gc.enable()
    s=getLinksThread({"citycode":"liaoyang","kind":"3"})
    s.start()
def main():
    """Start one crawler thread per city in the module-level citylist."""
    for city in citylist:
        startCityThread(city).start()
        # Runtime log text kept verbatim; it reads "thread started".
        msglogger.info("%s 线程启动"%city)