コード例 #1
0
ファイル: tongcheng58.py プロジェクト: aviatorBeijing/ptpy
    def buy(self,url):
        """Scrape one 58.com "wanted to buy" listing page into ``self.fd``.

        The page at ``url`` is downloaded and parsed, and the extracted
        fields are written into the ``self.fd`` dict as a side effect.
        The method always returns ``None``.

        It returns early, leaving ``self.fd`` only partially filled, when:

        * ``self.mayGetIt(response)`` is truthy (``self.fd`` is reset to
          an empty dict in that case),
        * the listing is not marked as posted by a private owner,
        * no phone-number image is present,
        * the post time is missing or in an unrecognised format,
        * the listing is older than seven days.

        :param url: absolute URL of the listing; the host part before
            ``.58.com`` is used as the city code.
        """
        self.fd['house_flag'] = 3

        # Map the sub-domain to a city name when the lookup table knows it,
        # otherwise keep the raw sub-domain.
        city_code = urlparse(url)[1].replace('.58.com', "")
        self.fd['house_city'] = citynameDict_sf.get(city_code) or city_code

        request = urllib2.Request(url, None, self.header)
        page = urllib2.urlopen(request)
        try:
            response = page.read()
        finally:
            # Release the connection even if reading fails.
            page.close()

        if self.mayGetIt(response):
            self.fd = {}
            return

        soup = BeautifulSoup(response)
        detail_mer = soup.find('ul', {'class': 'info'})
        detail_mer_str = str(detail_mer).replace(" ", "")

        # Only listings posted by private owners are kept.
        agency_match = re.search(self.agencyname_regex, response)
        if not agency_match or agency_match.group(1) != '个人房源':
            return

        owner_match = re.search(self.username_regex, response)
        self.fd['owner_name'] = owner_match.group(1) if owner_match else ""

        # The phone number is published as an image; remember its URL.
        # The last matching image wins, as before.
        self.fd['owner_phone_pic'] = ''
        for img in soup('img'):
            src = img.get('src') or ''
            if 'http://image.58.com/showphone.aspx' in src:
                self.fd['owner_phone_pic'] = src

        # No contact information: skip the listing.
        if not self.fd['owner_phone_pic']:
            return

        other_box = soup.find('div', {"class": 'other'})
        if other_box and other_box.contents:
            posttime = other_box.contents[0]
            posttime = re.sub('\n|\r| |\t', '', posttime)
            posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
        else:
            posttime = ''

        # Convert the textual post time into a Unix timestamp.
        if not posttime:
            return
        elif posttime.find('-') != -1:
            # Absolute date: year-month-day.
            date_parts = posttime.split('-')
            posted = datetime.datetime(int(date_parts[0]),
                                       int(date_parts[1]),
                                       int(date_parts[2]))
            posttime = int(time.mktime(posted.timetuple()))
        elif posttime.find('分钟') != -1:
            # Relative: "N minutes ago".
            seconds_ago = int(posttime.replace('分钟前', '')) * 60
            posttime = int(time.time() - seconds_ago)
        elif posttime.find('小时') != -1:
            # Relative: "N hours ago".
            seconds_ago = int(posttime.replace('小时前', '')) * 60 * 60
            posttime = int(time.time() - seconds_ago)
        else:
            # Unknown format: treat like a missing post time instead of
            # failing on the string arithmetic in the age check below.
            return
        self.fd['house_posttime'] = posttime

        # Ignore listings older than one week.
        if (time.time() - self.fd['house_posttime']) > 3600 * 24 * 7:
            return

        self.fd['house_floor'] = 0
        self.fd['house_topfloor'] = 0

        # Requested area: try the "min-max" pattern first, then a single
        # value, and fall back to 0 when neither matches.
        area_match = re.search(self.house_totalarea_req_regex, detail_mer_str)
        if area_match:
            self.fd['house_area'] = int(area_match.group(1))
            self.fd['house_area_max'] = int(area_match.group(2))
        else:
            area_match = re.search(self.house_totalarea_regex, detail_mer_str)
            area = int(area_match.group(1)) if area_match else 0
            self.fd['house_area'] = area
            self.fd['house_area_max'] = area

        # Property type.
        self.fd['house_type'] = housetype(detail_mer_str)

        # Price: a missing price or "negotiable" counts as 0.
        price_tag = detail_mer.em if detail_mer is not None else None
        house_price = price_tag.string if price_tag is not None else None
        if not house_price or house_price == "面议":
            house_price = "0"
        if house_price.find('-') != -1:
            first, second = [int(part) for part in house_price.split('-')[:2]]
            self.fd['house_price_min'] = min(first, second)
            self.fd['house_price_max'] = max(first, second)
            self.fd['house_price'] = first
        else:
            price = int(house_price)
            self.fd['house_price_min'] = price
            self.fd['house_price_max'] = price
            self.fd['house_price'] = price

        room_match = re.search(self.house_room_regex, detail_mer_str)
        rooms = int(room_match.group(1)) if room_match else 0
        self.fd['house_room'] = rooms
        self.fd['house_room1'] = rooms

        self.fd['house_hall'] = 0
        self.fd['house_toilet'] = 0

        title_match = re.search(self.house_title_regex, response)
        if title_match:
            # Strip the listing-kind markers from the title.
            self.fd['house_title'] = title_match.group(1).replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''

        # Description: drop markup, line breaks and the site's boilerplate.
        detail_box = soup.find('div', {'class': 'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = ""

        # Residential compound name.
        addr_match = re.search(self.house_addr_regex, detail_mer_str)
        self.fd['house_addr'] = addr_match.group(1) if addr_match else ''

        # Region / section: default both to "" so the keys are always set,
        # then fill them from the links in the "区域:" row when present.
        self.fd['house_region'] = ""
        self.fd['house_section'] = ""
        lis = PyQuery(unicode(repr(detail_mer), "UTF-8"))("li")
        for li in lis:
            li_query = PyQuery(li)
            if "区域:" in li_query.text():
                links = li_query("a")
                if len(links) in (1, 2):
                    self.fd['house_region'] = PyQuery(links.eq(0)).text()
                if len(links) == 2:
                    self.fd['house_section'] = PyQuery(links.eq(1)).text()
                break

        self.fd['house_age'] = 0

        # Orientation and decoration level are not extracted for this
        # listing kind.
        self.fd['house_toward'] = 0
        self.fd['house_fitment'] = 0