Example No. 1
0
class Crawler(object):
    """Scraper for mm.taobao.com model albums.

    Flow: getLinkIdAndNames() parses a model-list page into links/ids/names,
    getAlbums() pages through each model's albums, and getImages() pages
    through each album's photo-list JSON.  Every parsed entity is persisted
    through the project's SqlHelper.
    """

    def __init__(self):
        super(Crawler, self).__init__()
        # Paged endpoints: a user's album list, and an album's photo list.
        self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
        self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'
        # Thumbnail URLs carry a 290x10000 size suffix; "picId" holds the photo id.
        self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
        self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
        self.model_pattern = re.compile(
            '''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
        self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)
        self.links = []  # model profile URLs
        self.ids = []    # user ids extracted from the profile URLs
        self.names = []  # model display names
        self.sql = SqlHelper()

    def readHtml(self, html):
        """Fetch the URL *html* and return the raw response bytes."""
        response = urllib.request.urlopen(html)
        return response.read()

    def getLinkIdAndNames(self, htmlData):
        """Fill self.links / self.names / self.ids from a model-list page.

        *htmlData* may be bytes (as produced by readHtml) or str; the site
        serves GBK-encoded pages (see the .decode('gbk') calls elsewhere in
        this class), so bytes are decoded as GBK here.
        """
        # BUGFIX: re.findall with a str pattern raises TypeError on bytes
        # input, while the old per-name .decode('gbk') raised AttributeError
        # on str in Python 3 -- decode the whole page once instead.
        if isinstance(htmlData, bytes):
            htmlData = htmlData.decode('gbk', errors='replace')

        items = re.findall(self.model_pattern, htmlData)

        # BUGFIX: 'items:' + items was str + list (TypeError); print separately.
        print('items:', items)

        self.links = [link for link, _ in items]
        self.names = [name for _, name in items]
        # Profile links look like '...?user_id=<id>': keep what follows '='.
        self.ids = [link[link.index('=') + 1:] for link in self.links]

    def getAlbums(self):
        """Page through every model's albums (pages 1-9), persisting the user
        and each album, then delegate the album's photos to getImages()."""
        for i, model_id in enumerate(self.ids):

            utils.log('start downloading:%s' % self.names[i])
            print('start downloading', self.names[i])

            # Insert the user row: (id, name, extra).
            command = self.sql.insert_data_to_users()
            msg = (
                model_id,
                self.names[i],
                "",
            )

            try:
                self.sql.insert_data(command, msg, commit=True)
            except Exception:
                utils.log('insert users data errors')

            for page in range(1, 10):
                utils.log('current page:%s' % page)

                model_url = self.album_prefix.format(model_id, page)
                soup = bs(self.readHtml(model_url), 'html.parser')
                albums = soup.find_all('div', class_='mm-photo-cell-middle')
                # An empty page means we ran past the last album page.
                if not albums:
                    break
                for album in albums:
                    album_name = album.find('h4').a.string.strip().rstrip('.')
                    album_link = album.find('h4').a['href']
                    album_id = re.findall(self.album_pattern, album_link)[0]
                    # Strip the "created at" label; the second strip covers a
                    # mojibake form of the label (presumably GBK-decoded-as-
                    # Latin-1 -- verify against live pages).
                    album_create_time = album.find(
                        'p', class_='mm-photo-date').string.strip(
                            u'创建时间: ').strip(u'´´½¨Ê±¼ä:')
                    album_img_count = album.find(
                        'span', class_='mm-pic-number').string.strip(
                            '()').strip(u'张').strip(u'ÕÅ')

                    # Debug output, kept on purpose.
                    print(">>>>>>>>>>>>>>>>>>>>>>")
                    print(album.find('p', class_='mm-photo-date').string)
                    print(album_create_time)
                    print(">>>>>>>>>>>>>>>>>>>>>>")

                    # Insert the album row.
                    command = self.sql.insert_data_to_albums()
                    msg = (album_id, model_id, album_name, album_create_time,
                           "", 1, album_img_count)
                    try:
                        self.sql.insert_data(command, msg, commit=True)
                    except Exception:
                        utils.log('insert albums data errors')

                    utils.log('start in album:%s, total size: %s' %
                              (album_name, album_img_count))

                    self.getImages(model_id, album_id, album_img_count)

    def getImages(self, model_id, album_id, image_count):
        """Page through one album's photo-list JSON and persist each image URL.

        *image_count* may be a numeric string (as scraped); the endpoint
        serves 16 photos per page.
        """
        # BUGFIX: '/' yields a float in Python 3 and range() rejects it --
        # use floor division for the page count.
        last_page = (int(image_count) - 1) // 16 + 2
        for page in range(1, last_page):
            link = self.image_prefix.format(model_id, album_id, page)
            body = self.readHtml(link).decode('gbk')
            images = re.findall(self.image_pattern, body)
            # Descriptions duplicate across photos, so picId is the stable name.
            names = re.findall(self.image_name_pattern, body)
            for idx, image in enumerate(images):
                # Ask for the 620px rendition instead of the 290px thumbnail,
                # then drop the size suffix to get the original file.
                image = image.replace('290', '620')
                # str.replace never raises, so the old try/except was dead code.
                img_url = ('http://' + image).replace('jpg_620x10000.jpg', 'jpg')

                # Insert the photo row: (id, album_id, name, url, kind).
                command = self.sql.insert_data_to_photos()
                msg = (None, album_id, "", img_url, 1)
                try:
                    self.sql.insert_data(command, msg, commit=True)
                except Exception:
                    utils.log('insert photos data errors')

                print('created photos success')
Example No. 2
0
File: mm.py  Project: cheenwe/mm
class Crawler(object):
    def __init__(self):
        """Prepare URL templates, regex matchers and result containers."""
        super(Crawler, self).__init__()
        # Paged endpoints: a user's album list, and an album's photo list.
        self.album_prefix = 'https://mm.taobao.com/self/album/open_album_list.htm?_charset=utf-8&user_id%20={0}&page={1}'
        self.image_prefix = 'https://mm.taobao.com/album/json/get_album_photo_list.htm?user_id={0}&album_id={1}&page={2}'
        # Matchers for thumbnail URLs, photo ids, model anchors and album ids.
        self.image_pattern = re.compile('''img.*290x10000.jpg''', re.U)
        self.image_name_pattern = re.compile('''"picId":"(.*?)"''', re.U)
        self.model_pattern = re.compile(
            '''<a class="lady-name" href="(.*?)".*>(.*?)</a>''', re.U)
        self.album_pattern = re.compile('''.*album_id=(.*?)&.*''', re.U)
        # Parsed model data plus the database helper.
        self.links, self.ids, self.names = [], [], []
        self.sql = SqlHelper()

    def readHtml(self, html):
        """Open the URL *html* and return the raw response body."""
        return urllib2.urlopen(html).read()

    def getLinkIdAndNames(self, htmlData):
        """Parse model profile links, GBK-decoded display names, and user
        ids out of a raw model-list page (Python 2 byte string)."""
        matches = re.findall(self.model_pattern, htmlData)
        self.links = [href for href, _ in matches]
        self.names = [raw_name.decode('gbk') for _, raw_name in matches]
        # Each link ends in 'user_id=<id>'; keep everything after the '='.
        self.ids = [href[href.index('=') + 1:] for href in self.links]

    def getAlbums(self):
        """For every known model id: persist the user, then walk that
        model's album pages (1-9), persist each album, and descend into
        its photos via getImages().

        NOTE(review): Python 2 code ('except Exception, e', xrange) --
        will not parse under Python 3.
        """
        for i, model_id in enumerate(self.ids):

            utils.log('start downloading:%s' % self.names[i])
            # print 'start downloading', self.names[i]

            # Insert the user row: (id, name, extra).
            command = self.sql.insert_data_to_users()
            msg = (
                model_id,
                self.names[i],
                "",
            )

            try:
                self.sql.insert_data(command, msg, commit=True)
            except Exception, e:
                utils.log('insert users data errors')

            for page in xrange(1, 10):
                utils.log('current page:%s' % page)
                # print 'current page', page

                model_url = self.album_prefix.format(model_id, page)
                soup = bs(self.readHtml(model_url), 'html.parser')
                albums = soup.find_all('div', class_='mm-photo-cell-middle')
                # An empty page means we ran past the last album page.
                if not albums:
                    break
                for album in albums:
                    album_name = album.find('h4').a.string.strip().rstrip('.')
                    album_link = album.find('h4').a['href']
                    album_id = re.findall(self.album_pattern, album_link)[0]
                    # Strip the "created at" label; the second strip covers a
                    # mojibake variant of the same label (presumably -- verify).
                    album_create_time = album.find(
                        'p', class_='mm-photo-date').string.strip(
                            u'创建时间: ').strip(u'´´½¨Ê±¼ä:')
                    album_img_count = album.find(
                        'span', class_='mm-pic-number').string.strip(
                            '()').strip(u'张').strip(u'ÕÅ')

                    # print ">>>>>>>>>>>>>>>>>>>>>>"
                    # print album.find('p', class_ = 'mm-photo-date').string
                    # print album_create_time
                    # print ">>>>>>>>>>>>>>>>>>>>>>"

                    # Insert the album row.
                    command = self.sql.insert_data_to_albums()
                    msg = (album_id, model_id, album_name, album_create_time,
                           "", 1, album_img_count)
                    try:
                        self.sql.insert_data(command, msg, commit=True)
                    except Exception, e:
                        utils.log('insert albums data errors')

                    utils.log('start in album:%s, total size: %s' %
                              (album_name, album_img_count))

                    self.getImages(model_id, album_id, album_img_count)