Beispiel #1
0
 def extractDict(self):
     """Extract listing data into ``self.fd`` for each unseen URL in ``self.urls``.

     For every URL that ``checkPath`` does not report as already handled,
     the handler selected by ``self.kind`` is run ("1" -> ChuShou,
     "2" -> ChuZu, "3" -> QiuGou, anything else -> QiuZu), the city is
     taken from the URL's host part with ".58.com" removed, the fixed
     submission fields are set and every ``self.fd`` entry is printed.

     Returns None.

     NOTE(review): the method returns -- abandoning all remaining URLs --
     as soon as one listing is older than seven days; confirm that
     stopping (rather than skipping that URL) is intended.
     """
     # Seven days in seconds.  The original wrote 7*24*36000, which is
     # seventy days and contradicted its own "older than seven days" comment.
     max_age = 7 * 24 * 3600
     for url in self.urls:
         if checkPath(homepath, self.folder, url):
             continue
         # Start from 0; presumably the handler overwrites it with the
         # listing's post time.  A value left at 0 counts as stale below.
         self.fd["posttime"] = 0
         if self.kind == "1":
             self.ChuShou(url)
         elif self.kind == "2":
             self.ChuZu(url)
         elif self.kind == "3":
             self.QiuGou(url)
         else:
             self.QiuZu(url)
         self.fd['city'] = urlparse(url)[1].replace('.58.com', "")
         # NOTE(review): a makePath(homepath, self.folder, url) call was
         # commented out here, so this method does not record the URL.
         # Older than seven days: stop.
         if (time.time() - self.fd["posttime"]) > max_age:
             return
         self.fd["c"] = "houseapi"
         self.fd["a"] = "savehouse"
         self.fd["is_checked"] = 0
         self.fd["web_flag"] = "58"

         # is_checked was just set to 0, so the dump below always runs.
         if not self.fd["is_checked"]:
             for key, value in self.fd.items():
                 print("%s %s" % (key, value))
Beispiel #2
0
 def extractDict(self):
     """Extract each unseen URL in ``self.urls`` into ``self.fd`` and POST it.

     For every URL that ``checkPath`` does not report as already handled,
     the handler selected by ``self.kind`` is run ("1" -> sell,
     "2" -> rent, "3" -> buy, anything else -> require), the city is taken
     from the URL's host part with ".58.com" removed, and the URL is
     recorded with ``makePath``.  Errors during that extraction are
     swallowed (best effort) and whatever is in ``self.fd`` is still
     url-encoded and posted to http://site.jjr360.com/app.php; the
     response is decoded as GBK and printed.

     Returns None.

     NOTE(review): the method returns -- abandoning all remaining URLs --
     as soon as one listing is older than seven days; confirm that
     stopping (rather than skipping that URL) is intended.
     """
     # Seven days in seconds.  The original wrote 7*24*36000, which is
     # seventy days and contradicted its own "older than seven days" comment.
     max_age = 7 * 24 * 3600
     for url in self.urls:
         if checkPath(homepath, self.folder, url):
             continue
         try:
             self.fd["posttime"] = 0
             if self.kind == "1":
                 self.sell(url)
             elif self.kind == "2":
                 self.rent(url)
             elif self.kind == "3":
                 self.buy(url)
             else:
                 self.require(url)
             self.fd['city'] = urlparse(url)[1].replace('.58.com', "")
             makePath(homepath, self.folder, url)
             # Older than seven days: stop.
             if (time.time() - self.fd["posttime"]) > max_age:
                 return
         except Exception:
             # Deliberate best effort: a failed page must not stop the run.
             # Narrowed from a bare ``except`` so that KeyboardInterrupt and
             # SystemExit still propagate.
             pass
         # ``.get`` because the city may never have been set when the
         # extraction above failed early; indexing would raise KeyError.
         if self.fd.get('city') == 'su':
             self.fd['city'] = 'suzhou'
         self.fd["c"] = "houseapi"
         self.fd["a"] = "savehouse"
         self.fd["is_checked"] = 1
         self.fd["web_flag"] = "58"

         # Debug dump; it does not run while is_checked is set to 1 above.
         if not self.fd["is_checked"]:
             for key, value in self.fd.items():
                 print("%s %s" % (key, value))
         req = urllib2.Request("http://site.jjr360.com/app.php", urllib.urlencode(self.fd))
         p = self.br.open(req).read().strip()
         print(p.decode('gbk'))
Beispiel #3
0
 def extractDict(self):
     """Extract each unseen URL in ``self.urls`` into ``self.fd`` and POST it.

     For every URL that ``checkPath`` does not report as already handled,
     the handler selected by ``self.kind`` is run ("1" -> sell,
     "2" -> rent, "3" -> buy, anything else -> require) and the URL is
     recorded with ``makePath``.  Errors during that step are swallowed
     (best effort).  The fixed submission fields are then set; if
     ``self.fd`` ends up with exactly 7 or 17 entries the URL is skipped,
     otherwise ``self.fd`` is url-encoded and posted to
     http://site.jjr360.com/app.php and the GBK-decoded reply is printed.

     Returns None.

     NOTE(review): the method returns -- abandoning all remaining URLs --
     as soon as one listing is older than seven days; confirm that
     stopping (rather than skipping that URL) is intended.
     """
     # Seven days in seconds.  The original wrote 7*24*36000, which is
     # seventy days and contradicted its own "older than seven days" comment.
     max_age = 7 * 24 * 3600
     for url in self.urls:
         if checkPath(homepath, self.folder, url):
             continue
         try:
             if self.kind == "1":
                 self.sell(url)
             elif self.kind == "2":
                 self.rent(url)
             elif self.kind == "3":
                 self.buy(url)
             else:
                 self.require(url)
             makePath(homepath, self.folder, url)
             # Older than seven days: stop.  A missing "posttime" key raises
             # KeyError here, which the handler below swallows.
             if (time.time() - self.fd["posttime"]) > max_age:
                 return
         except Exception:
             # Deliberate best effort: a failed page must not stop the run.
             # Narrowed from a bare ``except`` so that KeyboardInterrupt and
             # SystemExit still propagate.
             pass
         self.fd["c"] = "houseapi"
         self.fd["a"] = "savehouse"
         self.fd["is_checked"] = 1
         self.fd["web_flag"] = "gj"

         # Debug dump; it does not run while is_checked is set to 1 above.
         if not self.fd["is_checked"]:
             for key, value in self.fd.items():
                 print("%s %s" % (key, value))
         print("*" * 80)
         # NOTE(review): 7 and 17 look like the field counts of an
         # incompletely extracted record -- confirm against the handlers.
         if len(self.fd) in (7, 17):
             print("#####################################")
             continue
         req = urllib2.Request("http://site.jjr360.com/app.php", urllib.urlencode(self.fd))
         p = self.br.open(req).read().strip()
         print(p.decode('gbk'))
         print("*" * 80)
Beispiel #4
0
    def extractDict(self):
        """Extract the single URL held in ``self.urls`` into ``self.fd``.

        If ``checkPath`` reports the URL as already handled, nothing is done
        and None is returned.  Otherwise the handler selected by
        ``self.kind`` is run ("1" -> sell, "2" -> rent, "3" -> buy,
        anything else -> require) and the URL is recorded with
        ``makePath``; any exception from that step is logged through
        ``msglogger`` and otherwise ignored.  The fixed submission fields
        are then set, a progress line is printed and ``self.fd`` is
        returned.

        The statements that used to follow ``return self.fd`` could never
        run and have been removed.
        """
        if checkPath(homepath, self.folder, self.urls):
            return None
        try:
            if self.kind == "1":
                self.sell(self.urls)
            elif self.kind == "2":
                self.rent(self.urls)
            elif self.kind == "3":
                self.buy(self.urls)
            else:
                self.require(self.urls)
            makePath(homepath, self.folder, self.urls)
            # NOTE(review): an age filter on self.fd["posttime"] used to
            # follow here but was commented out in the original.
        except Exception:
            msglogger.info("%s 链接采集异常"%self.urls)
        self.fd["c"] = "houseapi"
        self.fd["a"] = "savehouse"
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "gj"
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
        print("%s %s %s %s %s" % (
            "%s.soufun.com" % self.citycode,
            self.citycode,
            self.kind,
            timestamp,
            self.urls,
        ))
        return self.fd
Beispiel #5
0
 def extractDict(self):
     """Extract the single URL held in ``self.urls`` into ``self.fd``.

     Returns None when ``checkPath`` reports the URL as already handled or
     when the listing is older than seven days.  Otherwise the handler
     selected by ``self.kind`` is run ("1" -> sell, "2" -> rent,
     "3" -> buy, anything else -> require), the URL is recorded with
     ``makePath``, the fixed submission fields are set and ``self.fd`` is
     returned.  Errors during extraction are swallowed (best effort), so
     ``self.fd`` is still returned after a failure.

     The statements that used to follow ``return self.fd`` could never
     run and have been removed.
     """
     if checkPath(homepath, self.folder, self.urls):
         return None
     # Seven days in seconds.  The original wrote 7*24*36000, which is
     # seventy days and contradicted its own "older than seven days" comment.
     max_age = 7 * 24 * 3600
     try:
         if self.kind == "1":
             self.sell(self.urls)
         elif self.kind == "2":
             self.rent(self.urls)
         elif self.kind == "3":
             self.buy(self.urls)
         else:
             self.require(self.urls)
         makePath(homepath, self.folder, self.urls)
         # Older than seven days: give up on this listing.  A missing
         # "posttime" key raises KeyError, which is swallowed below.
         if (time.time() - self.fd["posttime"]) > max_age:
             return None
     except Exception:
         # Deliberate best effort.  Narrowed from a bare ``except`` so that
         # KeyboardInterrupt and SystemExit still propagate.
         pass
     self.fd["c"] = "houseapi"
     self.fd["a"] = "savehouse"
     self.fd["is_checked"] = 1
     self.fd["web_flag"] = "gj"
     return self.fd
Beispiel #6
0
    def extractDict(self):
        """Extract the single URL held in ``self.urls`` into ``self.fd``.

        Returns None when ``checkPath`` reports the URL as already handled.
        Otherwise the handler selected by ``self.kind`` is run
        ("1" -> sell, "2" -> rent, "3" -> buy, anything else -> require),
        the city is taken from the URL's host part with ".58.com" removed
        and the URL is recorded with ``makePath``; any exception from that
        step is logged through ``msglogger`` and otherwise ignored.
        Fields missing from ``self.fd`` are then filled from
        ``getDefaultVal(self.kind)``.

        When ``isDEV`` is true the defaulted fields are printed with their
        types and None is returned.  Otherwise the city is normalised
        ("su" or missing -> "suzhou"), the fixed fields are set and
        ``self.fd`` is returned -- replaced by a fresh empty dict when its
        "is_ok" entry equals False.

        The statements that used to follow ``return self.fd`` (including an
        HTTP POST) could never run and have been removed.
        """
        if checkPath(homepath, self.folder, self.urls):
            return None
        try:
            self.fd["posttime"] = 0
            if self.kind == "1":
                self.sell(self.urls)
            elif self.kind == "2":
                self.rent(self.urls)
            elif self.kind == "3":
                self.buy(self.urls)
            else:
                self.require(self.urls)
            self.fd['city'] = urlparse(self.urls)[1].replace('.58.com', "")
            makePath(homepath, self.folder, self.urls)
            # NOTE(review): an age filter on self.fd["posttime"] used to
            # follow here but was commented out in the original.
        except Exception:
            msglogger.info("%s 链接采集异常"%self.urls)

        # Fill in every field the handler did not provide.  This was
        # duplicated verbatim in both branches of the isDEV test.
        defaults = getDefaultVal(self.kind)
        for key, value in defaults.items():
            self.fd.setdefault(key, value)

        if isDEV:
            for key in defaults:
                print("%s %s %s" % (key, self.fd[key], type(self.fd[key])))
            return None

        # A missing city (extraction failed early) is treated like "su".
        if self.fd.get('city', 'su') == 'su':
            self.fd['city'] = 'suzhou'
        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "58"
        # ``== False`` is kept on purpose: it also matches 0, which
        # ``is False`` would not.
        if self.fd.get('is_ok') == False:
            self.fd = {}
        return self.fd
Beispiel #7
0
    def __getAllNeedLinks(self):
        """Walk the paginated listing and hand each unseen link to ``getContent``.

        Pages are requested as ``self.baseUrl + self.urlpath % page`` with
        page numbers starting at 1.  Crawling stops when the pager text
        ("div.pager strong span") is missing or unchanged from the previous
        page, or when a row's date, prefixed with the current year, sorts
        before ``self.endtime``.  Rows whose time text contains "今天",
        "小时" or "分钟" are always accepted.  Each link not already known
        to ``checkPath`` is logged to ``LinkLog`` and passed to
        ``getContent``; errors from ``getContent`` are printed and
        ignored.  The method sleeps ``int(self.st)`` seconds after every
        row.

        Returns None.
        """
        updated_re = re.compile(u'更新时间:(.*)')
        recent_markers = (u"今天", u"小时", u"分钟")
        cond = True
        idx = 0
        checkit = "0"
        while cond:
            url = self.baseUrl + self.urlpath % (str(idx + 1))
            req = urllib2.Request(url, None, self.header)
            try:
                p = self.br.open(req).read()
            except Exception:
                # Retry the same page.  Narrowed from a bare ``except`` so
                # that Ctrl-C can break out of the loop.
                # NOTE(review): there is still no back-off or retry limit.
                continue
            doc = PyQuery(p)
            p = None  # drop the raw page, as the original did
            check = doc("div.pager strong span").text()
            if check is None or check == checkit:
                break
            checkit = check
            if self.kind == "1" or self.kind == "3":
                links = doc("table.tbimg td.t")
            elif self.kind == "2" or self.kind == "4":
                links = doc("table.tblist tr")
            for link in links:
                row = PyQuery(link)
                # None means "no timestamp found on this row".  The original
                # left ``tm`` unbound (NameError on the first row) or stale
                # from the previous row in that case.
                tm = None
                if self.kind == "1" or self.kind == "3":
                    found = updated_re.search(row.text() or u"")
                    if found:
                        tm = found.group(1)
                elif self.kind == "2" or self.kind == "4":
                    tm = row("td.tc").eq(2).text()
                if tm is not None and not any(marker in tm for marker in recent_markers):
                    # The text carries no year, so the current year is
                    # prefixed before the string comparison with endtime.
                    ttt = "%s-%s" % (time.localtime().tm_year, tm)
                    if ttt < self.endtime:
                        cond = False
                        break
                lk = row("a.t").attr("href")
                if not checkPath(homepath, self.folder, lk):
                    LinkLog.info("%s|%s" % (self.kind, lk))
                    try:
                        getContent(lk, self.citycode, self.kind, self.upc)
                    except Exception as e:
                        print("58 getContent Exception %s" % e)
                time.sleep(int(self.st))
            # Advance to the next page.  The original never changed ``idx``,
            # so page 1 was fetched twice and the crawl then stopped.
            idx += 1
Beispiel #8
0
    def __getAllNeedLinks(self):
        """Walk the paginated listing and hand each unseen link to ``getContent``.

        Pages are requested as ``self.baseUrl + self.urlpath % ("f" + offset)``
        with the offset growing by 32 per page.  Crawling stops when the
        pager text ("ul.pageLink li a.c") is missing or unchanged from the
        previous page.  For kinds "3" and "4" it also stops when an entry's
        "MM-DD" date, prefixed with the current year, sorts before
        ``self.endtime``, or when its time text is neither such a date nor
        contains "分钟" / "小时".  Each link not already known to
        ``checkPath`` is logged to ``LinkLog`` and passed to
        ``getContent``; errors from ``getContent`` are printed and ignored.

        Returns None.
        """
        cond = True
        idx = 0
        checkit = "0"
        while cond:
            url = self.baseUrl + self.urlpath % ("f" + str(idx * 32))
            try:
                req = urllib2.Request(url, None, self.header)
                p = self.br.open(req).read()
            except Exception:
                # Retry the same page.  Narrowed from a bare ``except`` so
                # that Ctrl-C can break out of the loop.
                # NOTE(review): there is still no back-off or retry limit.
                continue
            doc = PyQuery(p)
            p = None  # drop the raw page, as the original did
            check = doc("ul.pageLink li a.c").text()
            if check is None or check == checkit:
                break
            checkit = check
            for link in doc("div.list dl"):
                entry = PyQuery(link)
                lk = self.baseUrl + entry(" a.list_title").attr("href")
                if self.kind == "3" or self.kind == "4":
                    # ``or u""``: a missing time text then falls into the
                    # "unrecognised -> stop" branch instead of crashing
                    # re.match with None.
                    tm = entry("dd span.time").text() or u""
                    if re.match(r'\d{2}-\d{2}', tm):
                        # The text carries no year, so the current year is
                        # prefixed before the string comparison with endtime.
                        tm = "%s-%s" % (time.localtime().tm_year, tm.strip())
                        if tm < self.endtime:
                            cond = False
                            break
                    elif u"分钟" in tm or u"小时" in tm:
                        # Unicode literals: testing a non-ASCII byte string
                        # against unicode text raises UnicodeDecodeError on
                        # Python 2.
                        pass
                    else:
                        cond = False
                        break
                if not checkPath(homepath, self.folder, lk):
                    LinkLog.info("%s|%s" % (self.kind, lk))
                    try:
                        getContent(lk, self.citycode, self.kind, self.upc)
                    except Exception as e:
                        print("ganji getContent Exception %s" % e)
            idx = idx + 1
Beispiel #9
0
 def extractDict(self):
     """Scrape every unseen URL in ``self.urls`` into ``self.fd`` and print it.

     ``self.fd["citycode"]`` is set once.  For each URL that ``checkPath``
     does not report as already handled, the page is downloaded and, when
     ``self.ht_r`` matches, its fields are pulled out with the instance's
     ``*_r`` regular expressions.  A field whose pattern does not match,
     or whose group is empty, is stored as "".  Pages whose house type is
     "商铺" are skipped entirely.  The URL is recorded with ``makePath``
     after a successful extraction, and all of ``self.fd`` is printed
     after every processed URL.

     Returns None.
     """
     def first_group(pattern, text, group=1):
         # One search per pattern (the original searched two or three
         # times); "" when there is no match or the group is empty.
         found = re.search(pattern, text)
         if found is None:
             return ""
         return found.group(group) or ""

     self.fd["citycode"] = self.citycode
     for url in self.urls:
         if checkPath(homepath, self.folder, url):
             continue
         req = urllib2.Request(url, None, self.header)
         page = self.br.open(req).read()
         type_match = re.search(self.ht_r, page)
         if type_match:
             raw_type = type_match.group(1)
             if "商铺" == raw_type:
                 continue
             self.fd["house_type"] = housetype(raw_type)
             self.fd["borough_section"] = first_group(self.ad_r, page)
             self.fd["cityarea"] = first_group(self.ca_r, page)
             self.fd["house_fitment"] = first_group(self.fm_r, page)
             self.fd["house_kind"] = self.kind
             self.fd["belong"] = first_group(self.bl_r, page)
             self.fd["house_price"] = first_group(self.hp_r, page)
             self.fd["house_totalarea"] = first_group(self.hta_r, page)

             # Split a layout string into the text before each marker
             # (room, hall, toilet).  Each slice starts after the previous
             # marker that was found.
             layout = first_group(self.hrht_r, page)
             start = 0
             for marker, field in (("室", "house_room"),
                                   ("厅", "house_hall"),
                                   ("卫", "house_toilet")):
                 pos = layout.find(marker)
                 if pos != -1:
                     self.fd[field] = layout[start:pos]
                     # len(marker) replaces the hard-coded 3, which was only
                     # right for UTF-8 encoded byte strings.
                     start = pos + len(marker)
                 else:
                     self.fd[field] = ""

             # The same pattern yields the floor (group 1) and the top
             # floor (group 2).
             floor_match = re.search(self.hf_r, page)
             if floor_match is not None:
                 self.fd["house_floor"] = floor_match.group(1) or ""
                 self.fd["house_topfloor"] = floor_match.group(2) or ""
             else:
                 self.fd["house_floor"] = ""
                 self.fd["house_topfloor"] = ""
             self.fd["house_age"] = first_group(self.ha_r, page)
             self.fd["house_sup"] = first_group(self.hs_r, page)
             self.fd["house_desc"] = first_group(self.hd_r, page)
             self.fd["borough_name"] = first_group(self.nm_r, page)
             makePath(homepath, self.folder, url)
         for key, value in self.fd.items():
             print("%s %s" % (key, value))

         print("=" * 60)
Beispiel #10
0
    def extractDict(self):
        """Extract the single URL held in ``self.urls`` into ``self.fd``.

        Returns None when ``checkPath`` reports the URL as already handled.
        Otherwise the handler selected by ``self.kind`` is run
        ("1" -> sell, "2" -> rent, "3" -> buy, anything else -> require)
        and the URL is recorded with ``makePath``.  On any exception
        ``self.fd['house_title']`` is set to None and the failure is logged
        through ``msglogger``.  Fields missing from ``self.fd`` are then
        filled from ``getDefaultVal(self.kind)``.

        When ``isDEV`` is true the defaulted fields are printed with their
        types and None is returned; otherwise the fixed fields are set and
        ``self.fd`` is returned.

        The statements that used to follow ``return self.fd`` could never
        run and have been removed.
        """
        if checkPath(homepath, self.folder, self.urls):
            return None
        try:
            if self.kind == "1":
                self.sell(self.urls)
            elif self.kind == "2":
                self.rent(self.urls)
            elif self.kind == "3":
                self.buy(self.urls)
            else:
                self.require(self.urls)
            makePath(homepath, self.folder, self.urls)
            # NOTE(review): an age filter on self.fd["posttime"] used to
            # follow here but was commented out in the original.
        except Exception:
            self.fd['house_title'] = None
            msglogger.info("%s 链接采集异常"%self.urls)

        # Fill in every field the handler did not provide.  This was
        # duplicated verbatim in both branches of the isDEV test.
        defaults = getDefaultVal(self.kind)
        for key, value in defaults.items():
            self.fd.setdefault(key, value)

        if isDEV:
            for key in defaults:
                print("%s %s %s" % (key, self.fd[key], type(self.fd[key])))
            return None

        self.fd["is_checked"] = 1
        self.fd["web_flag"] = "gj"
        return self.fd