Example #1
0
 def extractDict(self):
     self.fd["citycode"]=self.citycode
     for url in self.urls:
         if checkPath(homepath,self.folder,url):
             continue
         req=urllib2.Request(url, None, self.header)
         page=self.br.open(req).read()
         if re.search(self.ht_r, page):
             if "商铺"==re.search(self.ht_r, page).group(1):
                 continue
             else:
                 ht=housetype(re.search(self.ht_r, page).group(1))
                 self.fd["house_type"]=ht
                 #lambda a: a and self.fd["borough_section"]=a.group(1) or self.fd["borough_section"]=""
                 self.fd["borough_section"]=re.search(self.ad_r, page)!=None and re.search(self.ad_r, page).group(1) or ""
                 self.fd["cityarea"]=re.search(self.ca_r, page)!=None and re.search(self.ca_r, page).group(1) or ""
                 self.fd["house_fitment"]=re.search(self.fm_r, page)!=None and re.search(self.fm_r, page).group(1) or ""
                 self.fd["house_kind"]=self.kind
                 self.fd["belong"]=re.search(self.bl_r, page)!=None and re.search(self.bl_r, page).group(1) or ""
                 self.fd["house_price"]=re.search(self.hp_r, page)!=None and re.search(self.hp_r, page).group(1) or ""
                 self.fd["house_totalarea"]=re.search(self.hta_r, page)!=None and re.search(self.hta_r, page).group(1) or ""
                 house_type=re.search(self.hrht_r, page)!=None and re.search(self.hrht_r, page).group(1) or ""
                 blank=0
                 if house_type.find("室")!= -1:
                     self.fd["house_room"]=house_type[blank:house_type.find("室")]
                     blank=house_type.find("室")+3
                 else:
                     self.fd["house_room"]=""
                 if house_type.find("厅")!=-1:
                     self.fd["house_hall"]=house_type[blank:house_type.find("厅")]
                     blank=house_type.find("厅")+3
                 else:
                     self.fd["house_hall"]=""
                 if house_type.find("卫")!=-1:
                     self.fd["house_toilet"]=house_type[blank:house_type.find("卫")]
                 else:
                     self.fd["house_toilet"]=""
                 self.fd["house_floor"]=re.search(self.hf_r, page)!=None and re.search(self.hf_r, page).group(1) or ""
                 self.fd["house_topfloor"]=re.search(self.hf_r, page)!=None and re.search(self.hf_r, page).group(2) or ""
                 self.fd["house_age"]=re.search(self.ha_r, page)!=None and re.search(self.ha_r, page).group(1) or ""
                 self.fd["house_sup"]=re.search(self.hs_r, page)!=None and re.search(self.hs_r, page).group(1) or ""
                 self.fd["house_desc"]=re.search(self.hd_r, page)!=None and re.search(self.hd_r, page).group(1) or ""
                 self.fd["borough_name"]=re.search(self.nm_r, page)!=None and re.search(self.nm_r, page).group(1) or ""
                 makePath(homepath,self.folder,url)
         for ddd in  self.fd.items():
             print ddd[0],ddd[1]
     
         print "="*60
Example #2
0
    def QiuZu(self,url):
        self.fd['house_flag'] = 3
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0        
        self.fd['house_age'] = 0
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
        self.fd['house_deposit'] = 0
        self.fd['house_totalarea_max'] = 0
        self.fd['house_totalarea_min'] = 0
        self.fd['house_totalarea'] = 0
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        tree = etree.HTML(response)        
        soup =BeautifulSoup(response)
        
        
        detail_mer = soup.find('ul',{'class':'info'})
        detail_mer_str =str(detail_mer).replace(" ", "")
        #非个人房源 return
        #print re.search(self.agencyname_regex, response).group(1)
        if re.search(self.agencyname_regex, response):
            agencyname=re.search(self.agencyname_regex, response).group(1)
            if agencyname == '经纪人':return            
        else:
            return 
        
        if re.search(self.username_regex, response):
            username=re.search(self.username_regex, response).group(1)
            self.fd['owner_name'] = username
        else:             
            self.fd['owner_name'] = None

        owner_phone = soup('img')
        self.fd['owner_phone'] = ''
        for phone in owner_phone:
            if phone['src'].find('http://image.58.com/showphone.aspx') != -1:
                self.fd['owner_phone'] = phone['src']
            
        #没有联系方式  return
        if not self.fd['owner_phone']:return
        
        if soup.find('div',{"class":'other'}):
            posttime = soup.find('div',{"class":'other'}).contents[0]                            
            posttime = re.sub('\n|\r| |\t','',posttime.replace(" ", " "))
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''
        print posttime                     
        if not posttime:
            return                             
        elif posttime.find('-') !=-1:
            s = datetime.datetime(int(posttime.split('-')[0]),int(posttime.split('-')[1],),int(posttime.split('-')[2]))
            posttime = int(time.mktime(s.timetuple()))
        elif posttime.find('分钟') !=-1:
            n = int(posttime.replace('分钟前',''))*60
            posttime = int(time.time() - n)
        elif posttime.find('小时') !=-1:
            n = int(posttime.replace('小时前',''))*60*60
            posttime = int(time.time() - n)
        self.fd['posttime'] = posttime
                            
        if (time.time() - self.fd['posttime']) > 3600*24*7: 
            return
            print "++++++++++++++++"                 
        print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))    
        
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0 
        
        if re.search(self.house_totalarea_req_regex, detail_mer_str):
            house_totalarea_min=re.search(self.house_totalarea_req_regex, detail_mer_str).group(1)
            house_totalarea_max=re.search(self.house_totalarea_req_regex, detail_mer_str).group(2)
            self.fd['house_totalarea'] = house_totalarea_min
            self.fd['house_totalarea_max'] = house_totalarea_max
            self.fd['house_totalarea_min'] = house_totalarea_min
        else:
            if re.search(self.house_totalarea_regex, detail_mer_str):
                house_totalarea=re.search(self.house_totalarea_regex, detail_mer_str).group(1)
                self.fd['house_totalarea'] = house_totalarea
                self.fd['house_totalarea_max'] = house_totalarea
                self.fd['house_totalarea_min'] = house_totalarea
            else:                
                self.fd['house_totalarea'] = 0
                self.fd['house_totalarea_max'] = 0
                self.fd['house_totalarea_min'] = 0
            
        #类型 
        self.fd['house_type'] = housetype(detail_mer_str)
        
        house_price = detail_mer.em.string
        if house_price:
            house_price = house_price.replace('元','')
            if house_price.find("以上") != -1:
                self.fd['house_price_max'] = 0
                self.fd['house_price_min'] = house_price.replace('以上','')
                self.fd['house_price'] = house_price.replace('以上','')
            elif house_price.find("以下") != -1:
                self.fd['house_price_max'] = house_price.replace('以下','')
                self.fd['house_price_min'] = 0
                self.fd['house_price'] = house_price.replace('以下','')
            elif house_price.find("-") != -1:
                self.fd['house_price_max'] = house_price.split('-')[1]
                self.fd['house_price_min'] = house_price.split('-')[0]
                self.fd['house_price'] = house_price.split('-')[0]
            else:
                self.fd['house_price_max'] = 0
                self.fd['house_price_min'] = 0
                self.fd['house_price'] = 0
        else:
            self.fd['house_price_max'] = 0
            self.fd['house_price_min'] = 0
            self.fd['house_price'] = 0

        if re.search(self.house_room_regex, detail_mer_str):
            house_room=re.search(self.house_room_regex, detail_mer_str).group(1)
            self.fd['house_room'] = house_room
            self.fd['house_room1'] = house_room
        else:
            self.fd['house_room'] = '0'
            self.fd['house_room1'] = '0'
            
        self.fd['house_hall'] = '0'
        self.fd['house_toilet'] = '0'
        self.fd['house_toilet'] = '0'
        
        if re.search(self.house_title_regex, response):
            house_title=re.search(self.house_title_regex, response).group(1)
            self.fd['house_title'] = house_title
        else:
            self.fd['house_title'] = ''
        
        #描述        
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = None

        #小区名
        if re.search(self.house_addr_regex, detail_mer_str):
            house_addr = re.search(self.house_addr_regex, detail_mer_str).group(1)
            self.fd['house_addr'] = house_addr
            self.fd['borough_name'] = house_addr
            
        else:
            self.fd['house_addr'] = ''
            self.fd['borough_name'] = ''   
        
        #区域     
        #print detail_mer
        area_box = detail_mer.find(text="地段:").parent.parent
        area_a = area_box('a')
        if area_a and len(area_a)>1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = area_a[1].string
        elif area_a and len(area_a)==1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = None
        else:
            self.fd['cityarea'] = None
            self.fd['section'] = None
        
        self.fd['house_age'] = 0
            
        #朝向
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
Example #3
0
File: ganji.py Project: ptphp/PyLib
    def sell(self,url):
        """Download the for-sale listing at ``url`` and fill ``self.fd`` from it.

        The record is tagged with ``house_flag`` 1.  Numeric fields
        (floors, area, price, room counts) are stored as ``int`` and
        default to 0 when their pattern does not match.

        The method aborts with a bare ``raise`` (no exception object is
        supplied, so what the caller sees depends on the interpreter and
        on any exception currently being handled) when:

        * ``self.mayGetIt(response)`` is truthy (``self.fd`` is reset first),
        * the detail block does not contain u"个人房源",
        * no phone image is found,
        * the city name is not found.

        Returns nothing; all results are written to ``self.fd``.
        """
        def first_group(pattern, text, default):
            # Run the search once; keep the caller's default on a miss.
            found = re.search(pattern, text)
            if found:
                return found.group(1)
            return default

        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            raise
        tree = etree.HTML(response)
        soup = BeautifulSoup(response)

        self.fd['house_flag'] = 1
        self.fd['house_belong']=0
        self.fd['owner_phone']=''
        self.fd['house_area_max']=0
        self.fd['house_price_max']=''

        detail_mer = soup.find('div',{'class':'detail_mer'})
        # Only listings marked as posted by an individual are kept.
        if u"个人房源"  not in str(detail_mer):raise

        Dname = detail_mer.find('span',{'class':'Dname'})
        if Dname:
            self.fd['owner_name'] = str(Dname.string)
        else:
            self.fd['owner_name'] = None

        # The phone number is published as an image inside this span.
        # Test for the <img src> itself: str.find() returns -1 (truthy) on
        # a miss, so it cannot be used directly as a condition.
        phone_span = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        phone_img = phone_span.img if phone_span else None
        if phone_img is not None and phone_img.get('src'):
            self.fd['owner_phone_pic'] = 'http://'+urlparse(url)[1]+phone_img['src']
        else:
            self.fd['owner_phone_pic'] = None

        # No contact information: abort.
        if not self.fd['owner_phone_pic']:raise

        city_match = re.search("<span class=\"city\"><a .*?>(.*?)</a>", response)
        if not city_match:
            raise
        self.fd['cityname'] = city_match.group(1)

        # One pattern yields both the floor and the top floor.
        floor_match = re.search(self.house_floor_regex, response)
        if floor_match:
            self.fd['house_floor'] = int(floor_match.group(1))
            self.fd['house_topfloor'] = int(floor_match.group(2))
        else:
            self.fd['house_floor'] = 0
            self.fd['house_topfloor'] = 0

        self.fd['house_area'] = int(first_group(self.house_totalarea_regex, response, 0))

        # House type
        type_match = re.search(self.house_type_regex, response)
        if type_match:
            self.fd['house_type'] = housetype(type_match.group(1))
        else:
            self.fd['house_type'] = 6

        house_price = first_group(self.house_price_regex, response, 0)
        if house_price == "面议":
            house_price = 0
        self.fd['house_price'] = int(house_price)

        self.fd['house_room'] = int(first_group(self.house_room_regex, response, 0))
        self.fd['house_hall'] = int(first_group(self.house_hall_regex, response, 0))
        self.fd['house_toilet'] = int(first_group(self.house_toilet_regex, response, 0))
        self.fd['house_veranda'] = int(first_group(self.house_veranda_regex, response, 0))

        # Title: tolerate a missing or empty <h1> instead of failing on
        # an index or attribute lookup.
        title_nodes = CSSSelector("div.detail_title h1")(tree)
        house_title = ''
        if title_nodes and title_nodes[0].text:
            house_title = title_nodes[0].text.strip()
        self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")

        # Description: the second <p> of the detail box, tags stripped.
        detail_box = soup.find('div',{'class':'detail_box'})
        paragraphs = detail_box('p') if detail_box else []
        if len(paragraphs) > 1:
            house_desc = str(paragraphs[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = ""

        d_i = soup.find('ul',{'class':'d_i'})

        # Residential compound name and address.  Start from defaults so
        # that a path which finds nothing never leaves the keys unset (or
        # holding values from an earlier call, if self.fd is reused).
        self.fd['borough_name'] = None
        self.fd['house_addr'] = ""
        xiaoqu_match = re.search(self.xiaoqu_regex, response)
        if xiaoqu_match:
            # Preferred source: the values matched in the raw response.
            self.fd['borough_name'] = xiaoqu_match.group(1)
            self.fd['house_addr'] = first_group(self.address_regex, response, "")
        else:
            borough_label = d_i.find(text="小区: ") if d_i is not None else None
            if borough_label:
                borough_link = borough_label.parent.find("a")
                if borough_link:
                    self.fd['borough_name'] = str(borough_link.string)
                # Address: the text node that follows the compound link.
                if borough_link and borough_link.nextSibling:
                    house_addr = borough_link.nextSibling.string
                    self.fd['house_addr'] = re.sub(r"\(|\)| ","",house_addr)
            else:
                name_match = re.search(self.borough_name_regex, response)
                if name_match:
                    self.fd['borough_name'] = re.sub(r"\(.*\)| ","",name_match.group(1))

        # Region / section: the links next to the region label.
        area_label = d_i.find(text="区域: ") if d_i is not None else None
        area_a = area_label.parent('a') if area_label else []
        if len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""

        # The page value is subtracted from the current year.
        age_match = re.search(self.house_age_regex, response)
        if age_match:
            this_year = int(time.strftime('%Y', time.localtime()))
            self.fd['house_age'] = this_year - int(age_match.group(1))
        else:
            self.fd['house_age'] = 0

        # Orientation
        toward_match = re.search(self.house_toward_regex, response)
        if toward_match:
            self.fd['house_toward'] = toward(toward_match.group(1))
        else:
            self.fd['house_toward'] = 0

        fitment_match = re.search(self.house_fitment_regex, response)
        if fitment_match:
            self.fd['house_fitment'] = fitment(fitment_match.group(1))
        else:
            self.fd['house_fitment'] = 2
Example #4
0
    def rent(self,url):
        """Download the rental listing at ``url`` and fill ``self.fd`` from it.

        The record is tagged with ``house_flag`` 2.  The method returns
        early (leaving ``self.fd`` partially filled, or empty when
        ``self.mayGetIt(response)`` is truthy) when:

        * the agency pattern does not match or captures anything other
          than '个人房源',
        * no phone image served from 58.com/showphone.aspx is found,
        * the post time is missing or cannot be interpreted,
        * the listing is more than seven days old.

        Returns nothing; all results are written to ``self.fd``.
        """
        def first_group(pattern, text, default):
            # Run the search once; keep the caller's default on a miss.
            found = re.search(pattern, text)
            if found:
                return found.group(1)
            return default

        self.fd['house_flag'] = 2
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return
        soup = BeautifulSoup(response)
        detail_mer = soup.find('ul',{'class':'info'})
        # Strip newlines, tabs, carriage returns and spaces individually
        # (the pattern used to require a tab immediately followed by a
        # carriage return, so lone tabs and CRs survived).
        detail_mer_str = re.sub("\n|\t|\r| ","",str(detail_mer))

        # Only listings marked as posted by an individual are kept.
        agency_match = re.search(self.agencyname_regex, response)
        if not agency_match or agency_match.group(1) != '个人房源':
            return

        self.fd['owner_name'] = first_group(self.username_regex, response, None)

        # The phone number is published as an image; remember its URL.
        self.fd['owner_phone'] = ''
        for img in soup('img'):
            # Some <img> tags carry no src attribute; treat them as empty
            # instead of failing on the lookup.
            src = img.get('src') or ''
            if src.find('58.com/showphone.aspx') != -1:
                self.fd['owner_phone'] = src
        # No contact information: nothing worth storing.
        if not self.fd['owner_phone']:
            return

        other_box = soup.find('div',{"class":'other'})
        if other_box:
            posttime = other_box.contents[0]
            posttime = re.sub('\n|\r| |\t','',posttime)
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''

        # Convert the post time to a Unix timestamp.  It is either a
        # dash-separated date or a relative "N minutes/hours ago" text.
        if not posttime:
            return
        elif posttime.find('-') != -1:
            date_parts = posttime.split('-')
            s = datetime.datetime(int(date_parts[0]), int(date_parts[1]), int(date_parts[2]))
            posttime = int(time.mktime(s.timetuple()))
        elif posttime.find('分钟') != -1:
            n = int(posttime.replace('分钟前',''))*60
            posttime = int(time.time() - n)
        elif posttime.find('小时') != -1:
            n = int(posttime.replace('小时前',''))*60*60
            posttime = int(time.time() - n)
        else:
            # Unknown format: the age check below needs a number, so give
            # up here exactly as for a missing post time.
            return
        self.fd['posttime'] = posttime

        # Ignore listings older than one week.
        if (time.time() - self.fd['posttime']) > 3600*24*7:
            return

        self.fd['house_floor'] = first_group(self.house_floor_regex, detail_mer_str, None)
        self.fd['house_topfloor'] = first_group(self.house_topfloor_regex, detail_mer_str, None)
        self.fd['house_totalarea'] = first_group(self.house_totalarea_regex, detail_mer_str, None)

        # House type
        self.fd['house_type'] = housetype(detail_mer_str)

        # Price text lives in the first <em> of the info list; either node
        # may be absent on an unexpected page layout.
        price_tag = detail_mer.em if detail_mer is not None else None
        self.fd['house_price'] = price_tag.string if price_tag is not None else None

        self.fd['house_room'] = first_group(self.house_room_regex, detail_mer_str, '0')
        self.fd['house_hall'] = first_group(self.house_hall_regex, detail_mer_str, '0')
        self.fd['house_toilet'] = first_group(self.house_toilet_regex, detail_mer_str, '0')

        house_title = first_group(self.house_title_regex, response, '')
        self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")

        # Description
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = None

        # Residential compound name, with parenthesised text and tags
        # removed when possible.
        name_match = re.search(self.borough_name_regex, detail_mer_str)
        if name_match:
            borough_name = name_match.group(1)
            try:
                self.fd['borough_name'] = re.sub(r"\(.*\)|<.*?>","",borough_name)
            except Exception:
                # Best effort: keep the raw capture if cleaning fails.
                self.fd['borough_name'] = borough_name
        else:
            self.fd['borough_name'] = ''

        # District / section: the links next to the region label.
        area = detail_mer.find(text=u"区域:") if detail_mer is not None else None
        area_a = area.parent.parent('a') if area else []
        if len(area_a) > 1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = area_a[1].string
        elif len(area_a) == 1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = ""
        else:
            self.fd['cityarea'] = ""
            self.fd['section'] = ""

        self.fd['house_age'] = first_group(self.house_age_regex, response, None)

        # Orientation, fitment and deposit are derived from the info text.
        self.fd['house_toward'] = toward(detail_mer_str)
        self.fd['house_fitment'] = fitment(detail_mer_str)
        self.fd['house_deposit'] = deposit(detail_mer_str)
Example #5
0
    def ChuShou(self,url):
        self.fd['city'] = ''        
        self.fd['house_flag'] = 1
        self.fd['belong']=""
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        tree = etree.HTML(response)
        soup =BeautifulSoup(response)
        
        detail_mer = soup.find('div',{'class':'detail_mer'})
        
        #非个人房源 return
        if u"个人房源"  not in str(detail_mer):return        
        
        Dname = detail_mer.find('span',{'class':'Dname'})
        if Dname:
            self.fd['owner_name'] = Dname.string
        else:
            self.fd['owner_name'] = None
            
        ganji_phone_call_class = detail_mer.find('span',{'class':'ganji_phone_call_class'})
        
        if ganji_phone_call_class:
            self.fd['owner_phone'] = ganji_phone_call_class.contents[0]
            if str(ganji_phone_call_class).find('src='):                
                self.fd['owner_phone'] = 'http://'+urlparse(url)[1]+ganji_phone_call_class.img['src']
            else:
                self.fd['owner_phone'] = None            
        else:
            self.fd['owner_phone'] = None
            
            
        #没有联系方式  return
        if not self.fd['owner_phone']:return     
        
        if re.search("<span class=\"city\"><a .*?>(.*?)</a>", response):
            cityname=re.search("<span class=\"city\"><a .*?>(.*?)</a>", response).group(1)
            self.fd['cityname'] = cityname
        else:
            return   
        
        if re.search(self.house_floor_regex, response):
            house_floor=re.search(self.house_floor_regex, response).group(1)
            house_topfloor=re.search(self.house_floor_regex, response).group(2)
            self.fd['house_floor']    = house_floor
            self.fd['house_topfloor'] = house_topfloor
        else:
            self.fd['house_floor'] = None
            self.fd['house_topfloor'] = None   
        
        if re.search(self.house_totalarea_regex, response):
            house_totalarea=re.search(self.house_totalarea_regex, response).group(1)
            self.fd['house_totalarea'] = house_totalarea
        else:
            self.fd['house_totalarea'] = None
            
        #类型 
        if re.search(self.house_type_regex, response):
            house_type=re.search(self.house_type_regex, response).group(1)
            self.fd['house_type'] = housetype(house_type)
        else:
            self.fd['house_type'] = None   
            
        if re.search(self.house_price_regex, response):
            house_price=re.search(self.house_price_regex, response).group(1)
            self.fd['house_price'] = house_price
        else:
            self.fd['house_price'] = None
    
        posttime=CSSSelector('span.pub_time')(tree)!=None and CSSSelector('span.pub_time')(tree)[0].text.strip() or None 
        if posttime:
            Y=int(time.strftime('%Y', time.localtime()))
            M=int(posttime.split(' ')[0].split('-')[0])
            D=int(posttime.split(' ')[0].split('-')[1])
            s = datetime.datetime(Y,M,D,0,0)
            posttime=int(time.mktime(s.timetuple()))
            self.fd['posttime'] =posttime 
        else:
            self.fd['posttime'] =None
            
        if re.search(self.house_room_regex, response):
            house_room=re.search(self.house_room_regex, response).group(1)
            self.fd['house_room'] = house_room
        else:
            self.fd['house_room'] = '0'
            
        if re.search(self.house_hall_regex, response):
            house_hall=re.search(self.house_hall_regex, response).group(1)
            self.fd['house_hall'] = house_hall
        else:
            self.fd['house_hall'] = '0'
        
        if re.search(self.house_toilet_regex, response):
            house_toilet=re.search(self.house_toilet_regex, response).group(1)
            self.fd['house_toilet'] = house_toilet
        else:
            self.fd['house_toilet'] = '0'

        house_title=CSSSelector("div.detail_title h1")(tree)[0] !=None and CSSSelector("div.detail_title h1")(tree)[0].text.strip() or None
        self.fd['house_title'] = house_title
        
        #描述        
        detail_box = soup.find('div',{'class':'detail_box'})
        if detail_box:
            house_desc = str(detail_box('p')[1])
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时请说明是从赶集网上看到的","",house_desc)
        else:
            self.fd['house_desc'] = None

        d_i = soup.find('ul',{'class':'d_i'})
        
        #小区名
        #先处理JS
        if re.search(self.xiaoqu_regex, response):
            borough_name=re.search(self.xiaoqu_regex, response).group(1)
            self.fd['borough_name'] = borough_name
            print borough_name
            if re.search(self.address_regex, response):
                house_addr=re.search(self.address_regex, response).group(1)
                self.fd['house_addr'] = house_addr
        else:            
            if d_i.find(text="小区: "):
                borough_box = d_i.find(text="小区: ").parent        
                borough_name = borough_box.find("a")
                if borough_name:
                    self.fd['borough_name'] = borough_name.string
                else:
                    self.fd['borough_name'] = None            
                #地址
                if borough_name and borough_name.nextSibling:
                    house_addr = borough_name.nextSibling.string
                    self.fd['house_addr'] = re.sub("\(|\)| ","",house_addr)
                else:
                    self.fd['house_addr'] = None
            else:
                if re.search(self.borough_name_regex, response):
                    borough_name=re.search(self.borough_name_regex, response).group(1)
                    self.fd['borough_name'] = re.sub("\(.*\)| ","",borough_name)
            
        #区域     
        area_box = d_i.find(text="区域: ").parent
        area_a = area_box('a')
        if area_a and len(area_a)>1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = area_a[1].string
        elif area_a and len(area_a)==1:
            self.fd['cityarea'] = area_a[0].string
            self.fd['section'] = None
        else:
            self.fd['cityarea'] = None
            self.fd['section'] = None
        
        if re.search(self.house_age_regex, response):
            house_age=re.search(self.house_age_regex, response).group(1)
            self.fd['house_age'] = house_age
        else:
            self.fd['house_age'] = None
            
        #朝向
        if re.search(self.house_toward_regex, response):
            house_toward=re.search(self.house_toward_regex, response).group(1)
            self.fd['house_toward'] = toward(house_toward)
        else:
            self.fd['house_toward'] = None        
            
        if re.search(self.house_fitment_regex, response):
            house_fitment=re.search(self.house_fitment_regex, response).group(1)
            self.fd['house_fitment'] = fitment(house_fitment)
        else:
            self.fd['house_fitment'] = 2
Example #6
0
    def require(self,url):
        """Scrape one 58.com listing page and store its fields in ``self.fd``.

        The record is tagged with ``house_flag`` 4.  The method returns early,
        leaving ``self.fd`` only partly filled, when:

        * ``self.mayGetIt(response)`` is true (``self.fd`` is reset to ``{}``),
        * the agency marker is missing, or marks the poster as a broker
          (``is_ok`` is then set to False),
        * no phone-number image is found,
        * the post time is missing or cannot be parsed,
        * the post is more than seven days old.

        :param url: listing URL; the host with ``.58.com`` removed is used
            as the city key.
        :returns: None; all results are written to ``self.fd``.
        """
        self.fd['house_flag'] = 4
        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0
        self.fd['house_age'] = 0
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
        self.fd['house_deposit'] = 0
        self.fd['house_totalarea_max'] = 0
        self.fd['house_totalarea_min'] = 0
        self.fd['house_totalarea'] = 0

        # City: translate the sub-domain through citynameDict_sf, falling
        # back to the raw sub-domain when it has no (truthy) mapping.
        hc = urlparse(url)[1].replace('.58.com',"")
        self.fd['house_city'] = citynameDict_sf.get(hc) or hc

        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return
        soup = BeautifulSoup(response)

        detail_mer = soup.find('ul',{'class':'info'})
        detail_mer_str = str(detail_mer).replace(" ", "")

        # Only non-broker listings are wanted: bail out when the marker is
        # absent or says "broker".
        agency = re.search(self.agencyname_regex, response)
        if not agency:
            return
        if agency.group(1) == '经纪人':
            self.fd['is_ok']=False
            return

        owner = re.search(self.username_regex, response)
        self.fd['owner_name'] = owner.group(1) if owner else ""

        # The phone number is published as an image; remember the last
        # matching <img>.  .get() is used because an <img> without a src
        # attribute would raise KeyError on item access.
        self.fd['owner_phone_pic'] = ''
        for img in soup('img'):
            src = img.get('src') or ''
            if src.find('http://image.58.com/showphone.aspx') != -1:
                self.fd['owner_phone_pic'] = src

        # No contact information: return.
        if not self.fd['owner_phone_pic']:return

        other = soup.find('div',{"class":'other'})
        if other:
            posttime = other.contents[0]
            posttime = re.sub('\n|\r| |\t','',posttime.replace("&nbsp;", " "))
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''

        if not posttime:
            return
        # Convert "YYYY-M-D", "N minutes ago" or "N hours ago" to a Unix
        # timestamp.  Any other text is treated like a missing post time;
        # previously it stayed a string and the age check below raised
        # TypeError.
        try:
            if posttime.find('-') !=-1:
                ymd = posttime.split('-')
                s = datetime.datetime(int(ymd[0]),int(ymd[1]),int(ymd[2]))
                posttime = int(time.mktime(s.timetuple()))
            elif posttime.find('分钟') !=-1:
                n = int(posttime.replace('分钟前',''))*60
                posttime = int(time.time() - n)
            elif posttime.find('小时') !=-1:
                n = int(posttime.replace('小时前',''))*60*60
                posttime = int(time.time() - n)
            else:
                return
        except (ValueError, IndexError):
            return
        self.fd['house_posttime'] = posttime

        # Skip posts older than seven days.
        if (time.time() - posttime) > 3600*24*7:
            return

        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0

        # Area: prefer the two-group "range" pattern, then the single-value
        # pattern, else 0.
        area_range = re.search(self.house_totalarea_req_regex, detail_mer_str)
        if area_range:
            self.fd['house_area'] = int(area_range.group(1))
            self.fd['house_area_max'] = int(area_range.group(2))
        else:
            area_single = re.search(self.house_totalarea_regex, detail_mer_str)
            if area_single:
                self.fd['house_area'] = int(area_single.group(1))
                self.fd['house_area_max'] = int(area_single.group(1))
            else:
                self.fd['house_area'] = 0
                self.fd['house_area_max'] = 0

        # Type
        self.fd['house_type'] = housetype(detail_mer_str)

        # Price: read from the <em> inside the info list, if both exist.
        price_tag = detail_mer.em if detail_mer is not None else None
        house_price = price_tag.string if price_tag is not None else None
        if house_price=="面议":
            # "negotiable" is recorded as zero
            house_price="0"
        price_min = price_max = price = 0
        if house_price:
            house_price = house_price.replace('元','')
            if house_price.find("以上") != -1:
                # "X or more": X is the lower bound
                price_min = price = house_price.replace('以上','')
            elif house_price.find("以下") != -1:
                # "X or less": X is the upper bound
                price_max = price = house_price.replace('以下','')
            elif house_price.find("-") != -1:
                bounds = house_price.split('-')
                price_min = price = bounds[0]
                price_max = bounds[1]
            # NOTE(review): a plain number with none of the markers above is
            # still stored as 0, as before - confirm whether that is intended.
        self.fd['house_price_max'] = price_max
        self.fd['house_price_min'] = price_min
        self.fd['house_price'] = price

        room = re.search(self.house_room_regex, detail_mer_str)
        room_count = int(room.group(1)) if room else 0
        self.fd['house_room'] = room_count
        self.fd['house_room1'] = room_count

        self.fd['house_hall'] = 0
        self.fd['house_toilet'] = 0

        title = re.search(self.house_title_regex, response)
        if title:
            self.fd['house_title'] = title.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''

        # Description: the main content box with tags, line breaks and the
        # site's boilerplate sentence removed.
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = None

        # Region / section: two consecutive links inside the region cell.
        region = re.search(self.house_region_regex, detail_mer_str)
        if region:
            ha = re.search("<a.*>(.*)</a>&nbsp;<a.*>(.*)</a>",region.group(1))
            # When the cell does not hold two links the keys are left
            # untouched (same outcome as the former bare ``except: pass``).
            if ha:
                self.fd['house_region'] = ha.group(1)
                self.fd['house_section'] = ha.group(2)
        else:
            self.fd['house_addr'] = ''
            self.fd['borough_name'] = ''
            self.fd['house_region'] =""

        # District: first link is the city area, second (if any) the section.
        area = detail_mer.find(text="地段:") if detail_mer is not None else None
        links = area.parent.parent('a') if area else []
        self.fd['cityarea'] = links[0].string if len(links) >= 1 else None
        self.fd['section'] = links[1].string if len(links) > 1 else None

        self.fd['house_age'] = 0

        # Orientation and decoration are not extracted here.
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
Example #7
0
    def buy(self,url):
        """Scrape one 58.com listing page and store its fields in ``self.fd``.

        The record is tagged with ``house_flag`` 3.  The method returns early,
        leaving ``self.fd`` only partly filled, when:

        * ``self.mayGetIt(response)`` is true (``self.fd`` is reset to ``{}``),
        * the agency marker is missing or is not the private-owner label,
        * no phone-number image is found,
        * the post time is missing or cannot be parsed,
        * the post is more than seven days old.

        :param url: listing URL; the host with ``.58.com`` removed is used
            as the city key.
        :returns: None; all results are written to ``self.fd``.
        :raises ValueError: if the price text is not an integer or an
            ``A-B`` range of integers.
        """
        self.fd['house_flag'] = 3

        # City: translate the sub-domain through citynameDict_sf, falling
        # back to the raw sub-domain when it has no (truthy) mapping.
        hc = urlparse(url)[1].replace('.58.com',"")
        self.fd['house_city'] = citynameDict_sf.get(hc) or hc

        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return
        soup = BeautifulSoup(response)

        detail_mer = soup.find('ul',{'class':'info'})
        detail_mer_str = str(detail_mer).replace(" ", "")

        # Only private-owner listings are wanted.
        agency = re.search(self.agencyname_regex, response)
        if not agency or agency.group(1) != '个人房源':
            return

        owner = re.search(self.username_regex, response)
        self.fd['owner_name'] = owner.group(1) if owner else ""

        # The phone number is published as an image; remember the last
        # matching <img>.  .get() is used because an <img> without a src
        # attribute would raise KeyError on item access.
        self.fd['owner_phone_pic'] = ''
        for img in soup('img'):
            src = img.get('src') or ''
            if src.find('http://image.58.com/showphone.aspx') != -1:
                self.fd['owner_phone_pic'] = src

        # No contact information: return.
        if not self.fd['owner_phone_pic']:return

        other = soup.find('div',{"class":'other'})
        if other:
            posttime = other.contents[0]
            posttime = re.sub('\n|\r| |\t','',posttime)
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''

        if not posttime:
            return
        # Convert "YYYY-M-D", "N minutes ago" or "N hours ago" to a Unix
        # timestamp.  Any other text is treated like a missing post time;
        # previously it stayed a string and the age check below raised
        # TypeError.
        try:
            if posttime.find('-') !=-1:
                ymd = posttime.split('-')
                s = datetime.datetime(int(ymd[0]),int(ymd[1]),int(ymd[2]))
                posttime = int(time.mktime(s.timetuple()))
            elif posttime.find('分钟') !=-1:
                n = int(posttime.replace('分钟前',''))*60
                posttime = int(time.time() - n)
            elif posttime.find('小时') !=-1:
                n = int(posttime.replace('小时前',''))*60*60
                posttime = int(time.time() - n)
            else:
                return
        except (ValueError, IndexError):
            return
        self.fd['house_posttime'] = posttime

        # Skip posts older than seven days.
        if (time.time() - posttime) > 3600*24*7:
            return

        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0

        # Area: prefer the two-group "range" pattern, then the single-value
        # pattern, else 0.
        area_range = re.search(self.house_totalarea_req_regex, detail_mer_str)
        if area_range:
            self.fd['house_area'] = int(area_range.group(1))
            self.fd['house_area_max'] = int(area_range.group(2))
        else:
            area_single = re.search(self.house_totalarea_regex, detail_mer_str)
            if area_single:
                self.fd['house_area'] = int(area_single.group(1))
                self.fd['house_area_max'] = int(area_single.group(1))
            else:
                self.fd['house_area'] = 0
                self.fd['house_area_max'] = 0

        # Type
        self.fd['house_type'] = housetype(detail_mer_str)

        # Price: read from the <em> inside the info list.  A missing value
        # and "negotiable" are both recorded as zero.
        price_tag = detail_mer.em if detail_mer is not None else None
        house_price = price_tag.string if price_tag is not None else None
        if not house_price or house_price=="面议":
            house_price="0"
        if house_price.find('-') !=-1:
            # "low-high": the first number is the minimum, the second the
            # maximum (the two keys used to be swapped).
            bounds = house_price.split('-')
            self.fd['house_price_min'] = int(bounds[0])
            self.fd['house_price_max'] = int(bounds[1])
            self.fd['house_price'] = int(bounds[0])
        else:
            # Single value: min, max and price coincide (house_price_max
            # used to be left unset because house_price_min was set twice).
            self.fd['house_price_min'] = int(house_price)
            self.fd['house_price_max'] = int(house_price)
            self.fd['house_price'] = int(house_price)

        room = re.search(self.house_room_regex, detail_mer_str)
        room_count = int(room.group(1)) if room else 0
        self.fd['house_room'] = room_count
        self.fd['house_room1'] = room_count

        self.fd['house_hall'] = 0
        self.fd['house_toilet'] = 0

        title = re.search(self.house_title_regex, response)
        if title:
            self.fd['house_title'] = title.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''

        # Description: the main content box with tags, line breaks and the
        # site's boilerplate sentence removed.
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = ""

        # Address
        addr = re.search(self.house_addr_regex, detail_mer_str)
        self.fd['house_addr'] = addr.group(1) if addr else ''

        # District: take the links of the first <li> mentioning the district
        # label.  Both keys default to "" so they are always present, also
        # when that <li> holds fewer than two links.
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
        for li in PyQuery(unicode(repr(detail_mer),"UTF-8"))("li"):
            if "区域:" in PyQuery(li).text():
                links = PyQuery(li)("a")
                if len(links) >= 1:
                    self.fd['house_region'] = PyQuery(links.eq(0)).text()
                if len(links) >= 2:
                    self.fd['house_section'] = PyQuery(links.eq(1)).text()
                break

        self.fd['house_age'] = 0

        # Orientation and decoration are not extracted here.
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0
Example #8
0
    def sell(self,url):
        """Scrape one 58.com listing page and store its fields in ``self.fd``.

        The record is tagged with ``house_flag`` 1.  The method returns early,
        leaving ``self.fd`` only partly filled, when:

        * ``self.mayGetIt(response)`` is true (``self.fd`` is reset to ``{}``),
        * the agency marker is missing, or is not the private-owner label
          (``is_ok`` is then set to False),
        * no phone-number image is found.

        Unlike the sibling scrapers there is no age cut-off; a missing or
        unparseable post time falls back to the current time.

        :param url: listing URL; the host with ``.58.com`` removed is used
            as the city key.
        :returns: None; all results are written to ``self.fd``.
        :raises ValueError: if the price text is neither empty, the
            "negotiable" label, nor an integer.
        """
        # City: translate the sub-domain through citynameDict_sf, falling
        # back to the raw sub-domain when it has no (truthy) mapping.
        hc = urlparse(url)[1].replace('.58.com',"")
        self.fd['house_city'] = citynameDict_sf.get(hc) or hc

        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return
        soup = BeautifulSoup(response)

        self.fd['house_flag'] = 1
        detail_mer = soup.find('ul',{'class':'info'})
        detail_mer_str = str(detail_mer).replace(" ", "")

        # Only private-owner listings are wanted.
        agency = re.search(self.agencyname_regex, response)
        if not agency:
            return
        if agency.group(1) != '个人房源':
            self.fd['is_ok']=False
            return

        owner = re.search(self.username_regex, response)
        self.fd['owner_name'] = owner.group(1) if owner else None

        # The phone number is published as an image; remember the last
        # matching <img>.  .get() is used because an <img> without a src
        # attribute would raise KeyError on item access.
        self.fd['owner_phone_pic'] = ''
        for img in soup('img'):
            src = img.get('src') or ''
            if src.find('http://image.58.com/showphone.aspx') != -1:
                self.fd['owner_phone_pic'] = src

        # No contact information: return.
        if not self.fd['owner_phone_pic']:return

        other = soup.find('div',{"class":'other'})
        posttime = ''
        if other:
            posttime = other.contents[0]
            posttime = re.sub('\n|\r| |\t','',posttime)
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')

        # Convert "YYYY-M-D", "N minutes ago" or "N hours ago" to a Unix
        # timestamp; anything else (including no text at all) falls back to
        # "now".  Previously the fallback was overwritten by the raw string.
        try:
            if posttime.find('-') !=-1:
                ymd = posttime.split('-')
                s = datetime.datetime(int(ymd[0]),int(ymd[1]),int(ymd[2]))
                posttime = int(time.mktime(s.timetuple()))
            elif posttime.find('分钟') !=-1:
                n = int(posttime.replace('分钟前',''))*60
                posttime = int(time.time() - n)
            elif posttime.find('小时') !=-1:
                n = int(posttime.replace('小时前',''))*60*60
                posttime = int(time.time() - n)
            else:
                posttime = int(time.time())
        except (ValueError, IndexError):
            posttime = int(time.time())
        self.fd['house_posttime'] = posttime

        floor = re.search(self.house_floor_regex, detail_mer_str)
        self.fd['house_floor'] = int(floor.group(1)) if floor else 0

        topfloor = re.search(self.house_topfloor_regex, detail_mer_str)
        self.fd['house_topfloor'] = int(topfloor.group(1)) if topfloor else 0

        totalarea = re.search(self.house_totalarea_regex, detail_mer_str)
        self.fd['house_area'] = int(totalarea.group(1)) if totalarea else 0

        # Type
        self.fd['house_type'] = housetype(detail_mer_str)

        # Price: read from the <em> inside the info list.  A missing value
        # and "negotiable" are recorded as zero instead of crashing int().
        price_tag = detail_mer.em if detail_mer is not None else None
        price_text = price_tag.string if price_tag is not None else None
        if not price_text or price_text == "面议":
            self.fd['house_price'] = 0
        else:
            self.fd['house_price'] = int(price_text)

        room = re.search(self.house_room_regex, detail_mer_str)
        self.fd['house_room'] = int(room.group(1)) if room else 0

        hall = re.search(self.house_hall_regex, detail_mer_str)
        self.fd['house_hall'] = int(hall.group(1)) if hall else 0

        toilet = re.search(self.house_toilet_regex, detail_mer_str)
        self.fd['house_toilet'] = int(toilet.group(1)) if toilet else 0

        veranda = re.search(self.house_veranda_regex, response)
        self.fd['house_veranda'] = int(veranda.group(1)) if veranda else 0

        title = re.search(self.house_title_regex, response)
        if title:
            self.fd['house_title'] = title.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''

        # Description: the main content box with tags, line breaks and the
        # site's boilerplate sentence removed.
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = ""

        lis=PyQuery(unicode(detail_mer_str,"UTF-8"))("li")

        # Residential community name: text of the first <li> carrying the
        # community label, cut at "(" when it has the second-hand-listing
        # suffix.  Defaults to '' so the key is always present, as in the
        # sibling scrapers.
        self.fd['borough_name'] = ''
        for li in lis:
            text = PyQuery(li).text()
            if "小区:" in text:
                xq = text.replace("小区:","")
                if u"二手房信息" in xq:
                    cut = xq.find("(")
                    # find() returns -1 when "(" is absent; slicing with -1
                    # would silently drop the last character.
                    if cut != -1:
                        xq = xq[:cut]
                self.fd['borough_name'] = xq
                break

        # Address: text after the first ":" up to "(", or to the end of the
        # line when there is no "(" (same -1 pitfall as above).
        self.fd['house_addr'] = ''
        for li in lis:
            text = PyQuery(li).text()
            if "地址:" in text:
                start = text.find(":")+1
                end = text.find(u"(")
                if end != -1:
                    self.fd['house_addr'] = text[start:end]
                else:
                    self.fd['house_addr'] = text[start:]
                break

        # District: first link is the region, second (if any) the section.
        # Best effort - any lookup or conversion failure yields empty strings.
        try:
            links = detail_mer.find(text="区域:").parent.parent('a')
            region = str(links[0].string) if links else ""
            section = str(links[1].string) if len(links) > 1 else ""
        except Exception:
            region = section = ""
        self.fd['house_region'] = region
        # Was written to the key 'section' when no link was found.
        self.fd['house_section'] = section

        # Building age in years: current year minus the captured year.
        built = re.search(self.house_age_regex, response)
        if built:
            this_year = int(time.strftime('%Y', time.localtime()))
            self.fd['house_age'] = this_year - int(built.group(1))
        else:
            self.fd['house_age'] = 0

        # Orientation and decoration
        self.fd['house_toward'] = toward(detail_mer_str)
        self.fd['house_fitment'] = fitment(detail_mer_str)
Example #9
0
    def QiuZu(self, url):
        """Scrape one 58.com listing page and store its fields in ``self.fd``.

        The record is tagged with ``house_flag`` 3.  The method returns early,
        leaving ``self.fd`` only partly filled, when the agency marker is
        missing or marks the poster as a broker, when no phone-number image is
        found, when no post time is found, or when the post is more than
        seven days old.

        :param url: URL of the listing page to download.
        :returns: None; all results are written to ``self.fd``.
        """
        self.fd["house_flag"] = 3
        self.fd["house_floor"] = 0
        self.fd["house_topfloor"] = 0
        self.fd["house_age"] = 0
        self.fd["house_toward"] = 0
        self.fd["house_fitment"] = 0
        self.fd["house_deposit"] = 0
        self.fd["house_totalarea_max"] = 0
        self.fd["house_totalarea_min"] = 0
        self.fd["house_totalarea"] = 0
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        # NOTE(review): `tree` is not referenced again in the code shown here.
        tree = etree.HTML(response)
        soup = BeautifulSoup(response)

        detail_mer = soup.find("ul", {"class": "info"})
        # Serialised info list with every space removed; the regexes below
        # run against this string.
        detail_mer_str = str(detail_mer).replace(" ", "")
        # Not a private listing: return
        # print re.search(self.agencyname_regex, response).group(1)
        if re.search(self.agencyname_regex, response):
            agencyname = re.search(self.agencyname_regex, response).group(1)
            # "经纪人" means broker/agent.
            if agencyname == "经纪人":
                return
        else:
            return

        if re.search(self.username_regex, response):
            username = re.search(self.username_regex, response).group(1)
            self.fd["owner_name"] = username
        else:
            self.fd["owner_name"] = None

        # The phone number is published as an image; keep the src of the
        # last <img> that points at the show-phone endpoint.
        # NOTE(review): phone["src"] raises KeyError for an <img> without src.
        owner_phone = soup("img")
        self.fd["owner_phone"] = ""
        for phone in owner_phone:
            if phone["src"].find("http://image.58.com/showphone.aspx") != -1:
                self.fd["owner_phone"] = phone["src"]

        # No contact information: return
        if not self.fd["owner_phone"]:
            return

        # Post time: first child of the "other" div with whitespace and the
        # "posted at:" / "views" labels stripped.
        if soup.find("div", {"class": "other"}):
            posttime = soup.find("div", {"class": "other"}).contents[0]
            posttime = re.sub("\n|\r| |\t", "", posttime.replace("&nbsp;", " "))
            posttime = posttime.replace("发布时间:", "").replace(" 浏览", "")
        else:
            posttime = ""
        print posttime
        # Convert "YYYY-M-D", "N minutes ago" or "N hours ago" to a Unix
        # timestamp.
        # NOTE(review): text matching none of the branches stays a string, so
        # the subtraction in the age check below would raise TypeError.
        if not posttime:
            return
        elif posttime.find("-") != -1:
            s = datetime.datetime(int(posttime.split("-")[0]), int(posttime.split("-")[1]), int(posttime.split("-")[2]))
            posttime = int(time.mktime(s.timetuple()))
        elif posttime.find("分钟") != -1:
            n = int(posttime.replace("分钟前", "")) * 60
            posttime = int(time.time() - n)
        elif posttime.find("小时") != -1:
            n = int(posttime.replace("小时前", "")) * 60 * 60
            posttime = int(time.time() - n)
        self.fd["posttime"] = posttime

        # Skip posts older than seven days.
        if (time.time() - self.fd["posttime"]) > 3600 * 24 * 7:
            return
            # NOTE(review): the print below is unreachable (follows return).
            print "++++++++++++++++"
        print time.strftime("%Y %m %d", time.localtime(self.fd["posttime"]))

        self.fd["house_floor"] = 0
        self.fd["house_topfloor"] = 0

        # Area: prefer the two-group "range" pattern, then the single-value
        # pattern, else 0.  Matched values are stored as strings.
        if re.search(self.house_totalarea_req_regex, detail_mer_str):
            house_totalarea_min = re.search(self.house_totalarea_req_regex, detail_mer_str).group(1)
            house_totalarea_max = re.search(self.house_totalarea_req_regex, detail_mer_str).group(2)
            self.fd["house_totalarea"] = house_totalarea_min
            self.fd["house_totalarea_max"] = house_totalarea_max
            self.fd["house_totalarea_min"] = house_totalarea_min
        else:
            if re.search(self.house_totalarea_regex, detail_mer_str):
                house_totalarea = re.search(self.house_totalarea_regex, detail_mer_str).group(1)
                self.fd["house_totalarea"] = house_totalarea
                self.fd["house_totalarea_max"] = house_totalarea
                self.fd["house_totalarea_min"] = house_totalarea
            else:
                self.fd["house_totalarea"] = 0
                self.fd["house_totalarea_max"] = 0
                self.fd["house_totalarea_min"] = 0

        # Type
        self.fd["house_type"] = housetype(detail_mer_str)

        # Price: "X or more" sets the minimum, "X or less" the maximum,
        # "A-B" both; any other text is stored as 0.
        # NOTE(review): raises AttributeError when the info list or its <em>
        # is missing.
        house_price = detail_mer.em.string
        if house_price:
            house_price = house_price.replace("元", "")
            if house_price.find("以上") != -1:
                self.fd["house_price_max"] = 0
                self.fd["house_price_min"] = house_price.replace("以上", "")
                self.fd["house_price"] = house_price.replace("以上", "")
            elif house_price.find("以下") != -1:
                self.fd["house_price_max"] = house_price.replace("以下", "")
                self.fd["house_price_min"] = 0
                self.fd["house_price"] = house_price.replace("以下", "")
            elif house_price.find("-") != -1:
                self.fd["house_price_max"] = house_price.split("-")[1]
                self.fd["house_price_min"] = house_price.split("-")[0]
                self.fd["house_price"] = house_price.split("-")[0]
            else:
                self.fd["house_price_max"] = 0
                self.fd["house_price_min"] = 0
                self.fd["house_price"] = 0
        else:
            self.fd["house_price_max"] = 0
            self.fd["house_price_min"] = 0
            self.fd["house_price"] = 0

        if re.search(self.house_room_regex, detail_mer_str):
            house_room = re.search(self.house_room_regex, detail_mer_str).group(1)
            self.fd["house_room"] = house_room
            self.fd["house_room1"] = house_room
        else:
            self.fd["house_room"] = "0"
            self.fd["house_room1"] = "0"

        # Hall and toilet counts are not extracted; house_toilet is assigned
        # twice with the same value.
        self.fd["house_hall"] = "0"
        self.fd["house_toilet"] = "0"
        self.fd["house_toilet"] = "0"

        if re.search(self.house_title_regex, response):
            house_title = re.search(self.house_title_regex, response).group(1)
            self.fd["house_title"] = house_title
        else:
            self.fd["house_title"] = ""

        # Description: the main content box with tags, line breaks and the
        # site's boilerplate sentence removed.
        detail_box = soup.find("div", {"class": "maincon"})
        if detail_box:
            house_desc = str(detail_box)
            self.fd["house_desc"] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
        else:
            self.fd["house_desc"] = None

        # Residential community name (the same capture is also stored as the
        # address)
        if re.search(self.house_addr_regex, detail_mer_str):
            house_addr = re.search(self.house_addr_regex, detail_mer_str).group(1)
            self.fd["house_addr"] = house_addr
            self.fd["borough_name"] = house_addr

        else:
            self.fd["house_addr"] = ""
            self.fd["borough_name"] = ""

        # District: first link is the city area, second (if any) the section.
        # NOTE(review): find() returns None when the label is absent, which
        # makes the .parent access raise AttributeError.
        # print detail_mer
        area_box = detail_mer.find(text="地段:").parent.parent
        area_a = area_box("a")
        if area_a and len(area_a) > 1:
            self.fd["cityarea"] = area_a[0].string
            self.fd["section"] = area_a[1].string
        elif area_a and len(area_a) == 1:
            self.fd["cityarea"] = area_a[0].string
            self.fd["section"] = None
        else:
            self.fd["cityarea"] = None
            self.fd["section"] = None

        self.fd["house_age"] = 0

        # Orientation
        self.fd["house_toward"] = 0
        self.fd["house_fitment"] = 0