Example #1
0
class QiuBaiSpider():
    """Crawler that walks paginated user listings and stores each user found.

    For every page it extracts the author nodes selected by
    ``settings.author_path``, saves each complete record through
    ``DBHelper.save`` and downloads the avatar into ``./head/``; it then
    follows the link selected by ``settings.next_page_path`` until no such
    link is found.
    """

    def __init__(self):
        """Open the database helper and disable HTTPS certificate checks."""
        self.db = DBHelper()
        # NOTE(security): this swaps the process-wide default HTTPS context
        # for an unverified one, so certificates are no longer validated for
        # any later urllib HTTPS request. Kept for backward compatibility.
        ssl._create_default_https_context = ssl._create_unverified_context
        print('爬虫开始行动了……')

    def __del__(self):
        """Print the farewell message and close the database helper."""
        print('感谢有你,我要走了……')
        # __del__ also runs when __init__ raised before ``self.db`` was
        # assigned; look the attribute up defensively to avoid AttributeError.
        db = getattr(self, 'db', None)
        if db is not None:
            db.close()

    def request(self, url):
        """Fetch *url* through a randomly chosen HTTP proxy.

        Returns:
            The decoded response body when the status is 200, otherwise
            ``None``.
        """
        # Build an opener whose proxy handler uses one proxy picked at random.
        proxy = random.choice(settings.proxies['http'])
        opener = request.build_opener(
            request.ProxyHandler(proxies={'http': proxy}))
        req = request.Request(url, headers=settings.headers)
        # The context manager closes the response even if read/decode fails.
        with opener.open(req) as resp:
            if resp.status != 200:
                return None
            print('ok')
            return resp.read().decode()

    def parse(self, html):
        """Store every user found in *html* and return the next page URL.

        Args:
            html: Page markup as returned by :meth:`request`; may be ``None``
                or empty, in which case nothing is parsed.

        Returns:
            ``settings.start_url`` joined with the first next-page link, or
            ``None`` when there is nothing to parse or no such link.
        """
        if not html:
            # request() yields None for a non-200 response; nothing to parse.
            return None

        et = etree.HTML(html)
        for author in et.xpath(settings.author_path):
            try:
                home = author.xpath(settings.home_path)[0]
                # The id is the second-to-last path segment of the home link.
                user_id = home.split('/')[-2]
                name = author.xpath(settings.name_path)[0]
                age = author.xpath(settings.age_path)[0]
                # Make the image URL absolute and drop its query string.
                img = 'http:' + author.xpath(settings.src_path)[0].split('?')[0]
            except (IndexError, AttributeError, TypeError):
                # Best effort: an author node missing any field is skipped.
                # Only "field missing / wrong shape" errors are caught so that
                # Ctrl-C and genuine bugs are no longer swallowed.
                continue

            item = UserItem(user_id, name, age, img, home)
            # Persist the record, then fetch the avatar image.
            self.db.save(item)
            self.saveImg(img, user_id)

        # Read the link to the next page; its absence ends the crawl.
        next_links = et.xpath(settings.next_page_path)
        if not next_links:
            return None
        next_url = settings.start_url + next_links[0]
        print(next_url)
        return next_url

    def saveImg(self, url, id):
        """Download *url* to ``./head/<id>.<ext>`` unless the file exists.

        ``<ext>`` is whatever follows the last ``.`` in *url*.
        """
        filename = './head/{}.{}'.format(id, url.split('.')[-1])
        if os.path.exists(filename):
            return
        # urlretrieve does not create directories; make sure the target exists.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        request.urlretrieve(url, filename=filename)
        print(filename, '图片下载成功!!!')

    def run(self):
        """Crawl from ``settings.start_url`` until parse() returns no next URL."""
        next_url = settings.start_url
        while True:
            html = self.request(next_url)

            # Parse the page and obtain the URL for the next request.
            next_url = self.parse(html)

            if not next_url:
                break
Example #2
0
# -*-coding:utf-8-*-

from db import DBHelper
from item import UserItem

# Test case: print the result of exist('user', 1), then save a sample item.
db_ = DBHelper()
print(db_.exist('user', 1))

item = UserItem(1, 'lili', '14', 'xiha', 'http://')
# Pass the item to save(); the call previously had no argument, leaving
# `item` built but unused.
db_.save(item)