# Example no. 1
# 0
class Spider(object):
    """Scrape the mm.taobao.com top list and save each entry's pictures.

    For every entry found on a listing page, a directory named after the
    entry is created and the images found on the entry's own page are
    written into it as ``0.jpg``, ``1.jpg``, ...

    Attributes:
        tao: ``Taobao`` helper; its ``getpage(url)`` is used to fetch the
            individual pages.  NOTE(review): ``main()`` presumably prepares
            a session/login -- confirm against the ``Taobao`` class.
        u: Listing URL prefix; the page number is appended to it.
        rule_1: Pattern capturing (page URL, thumbnail URL, name) for each
            list item.  URLs are captured without the leading ``//``.
        rule_2: Pattern capturing the ``src`` of every ``<img>`` tag, again
            without the leading ``//``.
        rule_3: Pattern capturing the text between the opening
            ``mm-aixiu-content`` div and the next ``<!--``.
    """

    def __init__(self):
        self.tao = Taobao()
        self.tao.main()
        self.u = 'http://mm.taobao.com/json/request_top_list.htm?page='
        self.rule_1 = '<div class="list-item">.*?<div class="pic-word">.*?<a href="//(.*?)".*?<img src="//(.*?)".*?<a class="lady-name" .*?>(.*?)</a>'
        self.rule_2 = '<img.*?src="//(.*?)"'
        self.rule_3 = '<div class="mm-aixiu-content".*?">(.*?)<!--'

    def baseurl(self, i):
        """Return the listing URL for page number *i*."""
        return self.u + str(i)

    def read(self, url):
        """Fetch *url* and return its body decoded from GBK."""
        response = urlopen(urllib2.Request(url))
        try:
            return response.read().decode('gbk')
        finally:
            # Release the connection even if reading or decoding fails.
            response.close()

    def compile(self, rule):
        """Compile *rule* with ``re.S`` so that ``.`` also matches newlines."""
        return re.compile(rule, re.S)

    def mkdir(self, path):
        """Create directory *path* (surrounding whitespace is stripped).

        Returns:
            True if the directory was created, False if the path already
            existed.
        """
        path = path.strip()
        if os.path.exists(path):
            return False
        os.makedirs(path)
        return True

    def savePicutre(self, temp, path):
        """Download every image listed in *temp* into directory *path*.

        Args:
            temp: Iterable of image URLs without the ``http://`` prefix.
            path: Target directory (surrounding whitespace is stripped).
                Files are named ``<index>.jpg`` in iteration order.
        """
        path = path.strip()
        for index, address in enumerate(temp):
            print(address)
            response = urlopen('http://' + address)
            try:
                # Read before opening the output file so that a failed
                # download does not leave an empty file behind.
                data = response.read()
            finally:
                response.close()
            # os.path.join instead of a hard-coded '\\' so the file ends up
            # inside *path* on every platform, not only on Windows.
            with open(os.path.join(path, str(index) + '.jpg'), 'wb') as f:
                f.write(data)

    def load(self, start, end):
        """Process listing pages ``start`` .. ``end - 1``.

        Args:
            start: First page number (int).
            end: Page number to stop before (int); ``end`` itself is not
                fetched.

        Raises:
            TypeError: If *start* or *end* is not an ``int``.
            Exception: If the directory for an entry already exists.  This
                aborts the whole run, as in the original implementation.
        """
        if not (isinstance(start, int) and isinstance(end, int)):
            raise TypeError('start and end must be integers')

        # Compile once instead of once per loop iteration.
        item_re = self.compile(self.rule_1)
        section_re = self.compile(self.rule_3)
        image_re = self.compile(self.rule_2)

        for i in range(start, end):
            listing = self.read(self.baseurl(i))
            for page_link, _thumbnail, name in item_re.findall(listing):
                # NOTE(review): *name* comes straight from the scraped HTML
                # and is used as a directory name without sanitising.
                if not self.mkdir(name):
                    raise Exception('directory already exists: %r' % (name,))

                page_url = 'http://' + page_link
                # The previous decode('gbk')/encode('gbk') round trip handed
                # back the same bytes, so the page is used as returned.
                p = self.tao.getpage(page_url)

                picpart = section_re.search(p)
                if picpart is None:
                    # Page without the expected picture section: skip it
                    # instead of failing with AttributeError on None.
                    print('no picture section found: ' + page_url)
                    continue

                section = picpart.group(1)
                print(section)
                pic = image_re.findall(section)
                print(pic)
                self.savePicutre(pic, name)