def test():
    """Fetch one URL five times while varying the encoding settings.

    The spider comes from ``Spider.get_cache_spider()``. Between requests the
    ``encoding`` attribute is changed either on the response or on the spider
    itself, and every response is printed so the results can be compared by eye.
    """
    spider = Spider.get_cache_spider()
    spider.sleeper = ls.spider.NoSleeper
    spider.encoding = 'gbk'

    url = 'https://www.baidu.com/?wd={}'.format('javascrsdfsip')

    # Rounds 1-2: override the encoding on the response object.
    for response_encoding in ('gbk', 'utf8'):
        resp = spider.get(url)
        resp.encoding = response_encoding
        print(resp)

    # Round 3: no per-response override; the spider-level value is still 'gbk'.
    print(spider.get(url))

    # Rounds 4-5: change the spider-level encoding before fetching.
    for spider_encoding in ('utf8', None):
        spider.encoding = spider_encoding
        print(spider.get(url))
def __init__(self, spider=None):
    """Set up the crawler state.

    Args:
        spider: Object stored as ``self.spider``. When it is falsy
            (including the default ``None``) a fresh ``Spider()`` is created.

    ``self.start_urls`` is read here to seed the work queue, so it must
    already be available on the instance (it is not defined in this method).
    """
    # Fall back to a new Spider whenever no usable one was supplied.
    self.spider = spider or Spider()

    self.items = []
    # The queue starts out holding every start URL, in order.
    self.queue = deque(self.start_urls)

    self.request_middlewares = [generic_request_middleware]
    self.item_middlewares = [generic_item_middleware]
def __init__(self, proxy_pool: ProxyPoolBase, spider=None):
    """Set up the crawler state together with its proxy pool.

    Args:
        proxy_pool: Stored unchanged as ``self.proxy_pool``.
        spider: Object stored as ``self.spider``. When it is falsy
            (including the default ``None``) a fresh ``Spider()`` is created.

    ``self.start_urls`` is read here to seed the work queue, so it must
    already be available on the instance (it is not defined in this method).
    """
    # Fall back to a new Spider whenever no usable one was supplied.
    self.spider = spider or Spider()

    self.items = []
    # The queue starts out holding every start URL, in order.
    self.queue = deque(self.start_urls)

    self.proxy_pool = proxy_pool

    self.request_middlewares = [generic_request_middleware]
    self.item_middlewares = [generic_item_middleware]
"""Smoke script: fetch a page through the cache spider and print the result."""
from lazy_spider import Spider
from lazy_spider.utils import get_logger

logger = get_logger()

# Alternative kept from the original: a plain spider instead of the cache one.
# spider = Spider()
spider = Spider.get_cache_spider()

# NOTE(review): 'gb2313' is not a codec name Python's registry recognizes
# ('gb2312' is) -- confirm which encoding is actually intended here.
spider.encoding = 'gb2313'

result = spider.get('http://www.baidu.com')
print(result)

logger.debug('wdnmd')
"""
generic_template

Starting point for a small scraping script: a spider, a logger, a resource
folder wrapper and a peewee model bound to a local SQLite database.
"""
import logging

from peewee import *

from lazy_spider import ResourceRoot
from lazy_spider import Spider

spider = Spider()
logger = logging.getLogger('lazy_spider')
res = ResourceRoot('resources')
db = SqliteDatabase('db.sqlite')


class MyModel(Model):
    """Example record: a URL together with the text data stored for it."""

    url = CharField()
    data = TextField()

    class Meta:
        database = db


if __name__ == '__main__':
    # Close the resource root even when the request or the selector lookup
    # raises (e.g. IndexError when the page has no <title> match).
    try:
        r = spider.get('https://www.baidu.com/')
        # NOTE(review): 'gb2313' is not a codec name Python's registry
        # recognizes ('gb2312' is) -- confirm the intended encoding.
        r.encoding = 'gb2313'
        result = r.css('title')[0]
        logger.info(result)
    finally:
        res.close()
import logging
import sys

# Add the parent directory to `path`;
# otherwise this project cannot be imported.
sys.path.append('../')

# Import this project.
from lazy_spider import Spider, ResourceRoot

# Instantiate a `Spider`.
# It is your interface to the network and the main class of this module.
spider = Spider()
spider.set_sep_sleeper()

# The module's built-in `logger`; you can try `logger.debug` instead of
# `print`, which gives nicer output.
logger = logging.getLogger('lazy_spider')

# The module's class for managing resource files; it abstracts a resource
# folder into a class.
res = ResourceRoot('resources')

spider.headers_generator = lambda x: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55'
}

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={}'
    all_data = {'data': []}
    for each in range(0, 1000, 20):
        # Loggers from `logging.getLogger` interpolate arguments with
        # %-style formatting, so the placeholder must be `%s`; a `{}`
        # placeholder makes the record fail to format when it is emitted.
        logger.info('url: %s', url.format(each))
        # Convert the API response fetched by the spider to JSON and store it
        # in the `douban.json` file under the resource folder.
        # res['douban.json'] = lazy_spider.get(url.format(each)).json
"""Download a batch of images into the ``resources/imgs`` resource folder."""
import logging
import time

from lazy_spider import Spider, ResourceRoot

spider = Spider()
logger = logging.getLogger('lazy_spider')
res = ResourceRoot('resources/imgs')


def hg():
    """Return the request headers (User-Agent and referer) for the image API."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57',
        'referer': 'https://setu.awsl.ee/setu!'
    }


spider.headers_generator = hg

if __name__ == "__main__":
    for i in range(10, 20):
        # Current time rendered with 13 significant digits and the dot
        # removed; it is appended to the URL as the `w` query value.
        tmp = '{:13.13}'.format(time.time()).replace('.', '')
        image_url = 'https://setu.awsl.ee/api/setu!?w=' + tmp

        # Caching is explicitly disabled for this request.
        response = spider.get(image_url, cache=Spider.DISABLE_CACHE)

        # Store the raw body under "<i>.jpg".
        file_name = str(i) + '.jpg'
        res[file_name] = response.content

        time.sleep(0.1)

    # f = res['1.jpg']
    # tmp = '{:13.13}'.format(time.time()).replace('.', '')
    # r = lazy_spider.get('https://setu.awsl.ee/api/setu!?w=' + tmp,
import os
import sys

# Make the parent directory importable so `lazy_spider` resolves when the
# tests are run from inside this folder.
sys.path.append(os.pardir)

import logging

from lazy_spider import Spider
from lazy_spider.utils import ResourceRoot

spider = Spider()
logger = logging.getLogger('lazy_spider')
res = ResourceRoot('resources/imgs')


class TestResponse:
    """Checks on the response object returned by ``spider.get``."""

    def test_css(self):
        """The first ``title`` match of the fetched page has the expected text."""
        r = spider.get('https://www.baidu.com/')
        # NOTE(review): 'gb2313' is not a codec name Python's registry
        # recognizes ('gb2312' is) -- confirm the intended encoding.
        r.encoding = 'gb2313'
        result = r.css('title')[0]
        assert result.text == '百度一下,你就知道'


class TestSpider:
    """Checks on the ``ResourceRoot`` wrapper."""

    def test_resource(self):
        """Log the resource-root attributes and round-trip one entry."""
        # Loggers from `logging.getLogger` interpolate arguments with
        # %-style formatting, so the placeholders must be `%s`; `{}` makes
        # the record fail to format when it is emitted.
        logger.debug('list_dir: %s', res.list_dir)
        logger.debug('files: %s', res.files)
        logger.debug('dirs: %s', res.dirs)
        logger.debug('root_dir: %s', res.root_dir)
        res['hello'] = 'Hello, World.'
        hello = res['hello']