예제 #1
0
    def __init__(self):
        """Set up the helper objects and the empty per-run state.

        ``Openpage`` and ``GetJinWei`` are classes defined outside this
        snippet; their instances are only stored here, not used.
        """
        self.Openpage = Openpage()
        self.Getjinwei = GetJinWei()
        # Starts empty; nothing in this method fills it.
        self.shophref = []
        # Starts as an empty string; nothing in this method changes it.
        self.path = ''
예제 #2
0
class ShopMessage():
    """Collect shop details and photo links from dianping.com shop pages.

    The shop URLs to visit are read from ``shop_href.txt`` (see
    ``Get_Message``).  For every shop the extracted fields are written to
    ``<self.path>/<uid>.txt`` and appended to ``DZDP0322.txt``; the image
    links of the shop's photo pages go to ``<self.path>/photo_href.txt``.
    """

    def __init__(self):
        """Create the page fetcher, the coordinate decoder and empty state."""
        # Used through ``Getpage(url)`` and its ``key`` attribute.
        self.Openpage = Openpage()
        # Used through ``decode(poi)``.
        self.Getjinwei = GetJinWei()
        # Alternating URL / category lines loaded by ``Get_Message``.
        self.shophref = []
        # Output directory of the shop currently being processed.
        self.path = ''

    def _append_photo_hrefs(self, page):
        """Append every ``<img src>`` of *page* to ``photo_href.txt``."""
        hrefs = re.findall('<img src="(.*?)"', page, re.S)
        with open(self.path + '/' + 'photo_href' + '.txt', 'a+') as out:
            for href in hrefs:
                # Swap the thumbnail size markers for the 700px variants.
                href = href.replace('(249c249)', '(700c700)')
                href = href.replace('(249x249)', '(700x700)')
                href = href.replace('(240c180)', '(700x700)')
                out.write(href)
                out.write('\r\n')

    def Get_photo(self, url):
        """Store the image links of all photo pages of the shop at *url*.

        The first photo page is fetched once; when it advertises further
        pages (``PageLink`` titles) the last title is taken as the page
        count and pages 2..N are fetched as well.  ``self.path`` must
        already name an existing directory.
        """
        first_page = self.Openpage.Getpage(url + '/photos?')
        page_numbers = re.findall(
            'class="PageLink" title="(.*?)"', first_page, re.S)
        if not page_numbers:
            self._append_photo_hrefs(first_page)
            return
        last_page = int(page_numbers[-1])
        for number in range(1, last_page + 1):
            if number == 1:
                # Page 1 is the page that was already downloaded above.
                page = first_page
            else:
                page = self.Openpage.Getpage(
                    url + '/photos?pg=' + str(number))
            self._append_photo_hrefs(page)

    def Save_message(self, a, j):
        """Write the field list *a* to the per-shop file and the global log.

        *a* is the list returned by ``Deal_mypage``; ``a[0]`` names the
        per-shop file.  *j* is accepted for compatibility and not used.
        """
        new_path = self.path
        if not os.path.isdir(new_path):
            os.makedirs(new_path)
        fields = [str(item) for item in a]
        # One field per line in the shop's own file.
        with open(new_path + '/' + a[0] + '.txt', 'w+') as shop_file:
            for field in fields:
                shop_file.write(field)
                shop_file.write('\r\n')
        # 'qq' separates fields and 'fenjiexian' separates shops in the log.
        with open('DZDP0322' + '.txt', 'a+') as combined:
            for field in fields:
                combined.write(field)
                combined.write('qq')
            combined.write('\r\n')
            combined.write('fenjiexian')
            combined.write('\r\n')

    def Deal_mypage(self, myPage, uid, cate):
        """Extract the shop fields from *myPage* and set ``self.path``.

        Returns the list of fields, in order: uid, shop name(s), city, up
        to three category labels, *cate*, phone entries, opening hours,
        space-joined tags, the decoded ``poi`` values (or two ``'0.0'``),
        the address (only when both parts were found) and the shop
        introduction (or a placeholder).

        When *myPage* is empty, ``self.Openpage.key`` is incremented,
        ``self.path`` is left as ``''`` and ``0`` is returned.
        """
        self.path = ''
        if not myPage:
            self.Openpage.key = self.Openpage.key + 1
            return 0

        fenji = re.findall('<a href="http://www.dianping.com/search/category/(.*?)" itemprop="url">(.*?)</a>', myPage, re.S)
        shopname = re.findall('<h1 class="shop-name">(.*?)<a class=', myPage, re.S)
        address = re.findall('<span itemprop="locality region">(.*?)</span></a>', myPage, re.S)
        address2 = re.findall('<span class="item" itemprop="street-address" title="(.*?)">', myPage, re.S)
        tel = re.findall('<span class="item" itemprop="tel">(.*?)</span>', myPage, re.S)
        hours = re.findall('<span class="info-name">营业时间:</span>(.*?)<span class="item">(.*?)</span>', myPage, re.S)
        tag = re.findall('rel="tag" target="_blank">(.*?)</a>', myPage, re.S)
        poi = re.findall('poi: "(.*?)"', myPage)
        shanghujieshao = re.findall('<span class="info-name">商户简介:</span>(.*?)</p>', myPage, re.S)

        message = [uid]
        self.path = 'data0322/' + cate
        for name in shopname:
            message.append(name.replace('\n', ''))

        city = '苏州'
        self.path = self.path + '/' + city
        message.append(city)

        # NOTE(review): the earlier loop used a counter that was only
        # advanced when more than two categories existed, which works out
        # to "keep the first three" in every case.  Confirm that this is
        # the intended column layout.
        for _, label in fenji[:3]:
            label = label.replace('\n            ', '')
            label = label.replace('\n        ', '')
            message.append(label)
        message.append(cate)

        if tel:
            if len(tel) < 2:
                # A single number is preceded by a blank entry.
                message.append(' ')
            message.extend(tel)
        else:
            message.append('null')
            message.append('null')

        for entry in hours:
            text = entry[1]
            text = text.replace('\n                    ', '')
            text = text.replace('\n                ', '')
            text = text.replace('\r\n\n                ', '')
            text = text.replace('\r\n', '')
            message.append(text)

        message.append(''.join(one_tag + ' ' for one_tag in tag))

        if poi:
            message.extend(self.Getjinwei.decode(poi[0]))
        else:
            message.append('0.0')
            message.append('0.0')

        if address and address2:
            adds = address[0] + address2[0]
            message.append(adds.replace(',', ''))

        if shanghujieshao:
            message.extend(shanghujieshao)
        else:
            message.append('暂无商户简介')

        self.path = self.path + '/' + str(uid)
        return message

    def Get_Message(self, p):
        """Process every shop listed in ``shop_href.txt`` after line *p*.

        The file holds alternating lines: a shop URL followed by its
        category.  The first *p* lines are skipped.  Each shop page is
        fetched, parsed (``Deal_mypage``), saved (``Save_message``) and
        its photo links collected (``Get_photo``); *p* then advances by
        two.

        When a shop fails, the file is re-read and processing restarts at
        the current *p*, i.e. at the shop that failed.  This is done with
        a loop instead of a recursive call so that the shop list being
        iterated is never replaced underneath the loop and repeated
        failures cannot exhaust the call stack.  There is no retry limit.
        A trailing URL line without a category line is ignored.
        """
        while True:
            with open('shop_href' + '.txt', 'r') as f:
                lines = f.readlines()
            # Passed on to Save_message: one more than the number of lines.
            j = len(lines) + 1
            self.shophref = []
            for line in lines[max(p, 0):]:
                line = line.strip('\r\n')
                line = line.replace('" rel="nofollow', '')
                self.shophref.append(line)
            print(len(self.shophref))

            restart = False
            for i in range(0, len(self.shophref) - 1, 2):
                self.path = ''
                url = self.shophref[i]
                print(url)
                cate = self.shophref[i + 1]
                uid = re.findall(r'\d+', url)[0]
                try:
                    myPage = self.Openpage.Getpage(url)
                    message = self.Deal_mypage(myPage, uid, cate)
                    if message == 0:
                        # Empty page: fetch once more before giving up on
                        # this attempt.
                        myPage = self.Openpage.Getpage(url)
                        message = self.Deal_mypage(myPage, uid, cate)
                        if message == 0:
                            raise ValueError('empty page for %s' % url)
                    self.Save_message(message, j)
                    self.Get_photo(url)
                    p = p + 2
                    print("now is %d page" % p)
                except urllib2.HTTPError:
                    restart = True
                except Exception as e:
                    print(e)
                    restart = True
                if restart:
                    break
            if not restart:
                return