Ejemplo n.º 1
0
def test():
    """Request the same Baidu URL several times under different encodings.

    Each response is printed so the results of response-level and
    spider-level ``encoding`` assignments can be compared by eye.
    """
    spider = Spider.get_cache_spider()
    spider.sleeper = ls.spider.NoSleeper
    spider.encoding = 'gbk'

    url = 'https://www.baidu.com/?wd={}'.format('javascrsdfsip')

    # Assign the encoding on each individual response object.
    for response_encoding in ('gbk', 'utf8'):
        resp = spider.get(url)
        resp.encoding = response_encoding
        print(resp)

    # No response-level assignment; spider.encoding is still 'gbk' here.
    print(spider.get(url))

    # Reassign the encoding on the spider itself before each request.
    for spider_encoding in ('utf8', None):
        spider.encoding = spider_encoding
        print(spider.get(url))
Ejemplo n.º 2
0
    def __init__(self, spider=None):
        """Initialise the spider handle, work queue, item list and middlewares.

        :param spider: object stored as ``self.spider``; when it is falsy a
            new ``Spider()`` is created instead.
        """
        self.spider = spider or Spider()
        # The queue starts out holding every entry of ``self.start_urls``.
        self.queue = deque(self.start_urls)
        self.items = []

        # Each middleware list starts with the generic implementation.
        self.item_middlewares = [generic_item_middleware]
        self.request_middlewares = [generic_request_middleware]
Ejemplo n.º 3
0
    def __init__(self, proxy_pool: ProxyPoolBase, spider=None):
        """Initialise the spider handle, work queue, proxy pool and middlewares.

        :param proxy_pool: stored unchanged as ``self.proxy_pool``.
        :param spider: object stored as ``self.spider``; when it is falsy a
            new ``Spider()`` is created instead.
        """
        self.spider = spider or Spider()
        # The queue starts out holding every entry of ``self.start_urls``.
        self.queue = deque(self.start_urls)
        self.items = []
        self.proxy_pool = proxy_pool

        # Each middleware list starts with the generic implementation.
        self.item_middlewares = [generic_item_middleware]
        self.request_middlewares = [generic_request_middleware]
Ejemplo n.º 4
0
from lazy_spider import Spider
from lazy_spider.utils import get_logger

# Logger obtained from the project's own helper.
logger = get_logger()

# Alternative construction kept for reference; the active line below uses
# Spider.get_cache_spider() instead of the plain constructor.
# spider = Spider()
spider = Spider.get_cache_spider()

# NOTE(review): 'gb2313' is not a codec name Python recognises ('gb2312' was
# probably intended) -- confirm how Spider treats unknown encodings before
# changing it, since the current output may rely on a fallback.
spider.encoding = 'gb2313'
result = spider.get('http://www.baidu.com')

# Show the response on stdout, then emit one debug record.
print(result)
logger.debug('wdnmd')
Ejemplo n.º 5
0
"""
generic_template
"""

import logging

from peewee import *

from lazy_spider import ResourceRoot
from lazy_spider import Spider

# Module-level objects shared by the rest of this template.
spider = Spider()
# Same logger name ('lazy_spider') as the library package.
logger = logging.getLogger('lazy_spider')
# presumably rooted at the local 'resources' directory -- TODO confirm
res = ResourceRoot('resources')
# Database object that MyModel.Meta binds to.
db = SqliteDatabase('db.sqlite')


class MyModel(Model):
    """Model with a ``url`` character field and a ``data`` text field, bound to ``db``."""
    url = CharField()
    data = TextField()

    class Meta:
        # Bind the model to the module-level database object.
        database = db


if __name__ == '__main__':
    # Fetch the Baidu front page, log its first <title> match, then close
    # the resource root.
    response = spider.get('https://www.baidu.com/')
    response.encoding = 'gb2313'
    title_matches = response.css('title')
    logger.info(title_matches[0])
    res.close()
Ejemplo n.º 6
0
import logging
import sys

# Add the parent directory to `sys.path`;
# otherwise this project cannot be imported.
sys.path.append('../')
# Import this project.
from lazy_spider import Spider, ResourceRoot

# Instantiate a `Spider`.
# This is your interface to the network and the main class of this module.
spider = Spider()
spider.set_sep_sleeper()
# The module's built-in `logger`; try `logger.debug` instead of `print` for nicer output.
logger = logging.getLogger('lazy_spider')
# The module's resource-file manager class: it abstracts a resource folder as an object.
res = ResourceRoot('resources')

# Headers callable: ignores its argument and always returns the same
# desktop-browser User-Agent header.
spider.headers_generator = lambda x: {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36 Edg/87.0.664.55'
}

if __name__ == '__main__':
    url = 'https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=&start={}'
    all_data = {'data': []}
    # Step the `start` query parameter from 0 to 980 in increments of 20.
    for each in range(0, 1000, 20):
        # `logger` is a stdlib `logging` logger, which merges arguments with
        # `%`-style interpolation; a `{}` placeholder plus an argument is never
        # filled in and produces a logging format error. Build the message
        # eagerly so the URL is rendered whatever the logger's format style.
        logger.info('url: {}'.format(url.format(each)))

        # Convert the API response fetched by the spider to JSON and store it
        # in the `douban.json` file under the resource folder.
        # res['douban.json'] = lazy_spider.get(url.format(each)).json
Ejemplo n.º 7
0
import logging
import time

from lazy_spider import Spider, ResourceRoot

# Module-level objects used by the download loop below.
spider = Spider()
logger = logging.getLogger('lazy_spider')
# Downloaded images are stored through this resource root
# (presumably the local 'resources/imgs' directory -- TODO confirm).
res = ResourceRoot('resources/imgs')


def hg():
    """Return a new headers dict with a fixed ``User-Agent`` and ``referer``."""
    return {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36 Edg/87.0.664.57',
        'referer': 'https://setu.awsl.ee/setu!',
    }


# Install the header factory defined above on the spider.
spider.headers_generator = hg

if __name__ == "__main__":
    # Download ten images, stored as 10.jpg .. 19.jpg, pausing 0.1 s between
    # requests.
    for i in range(10, 20):
        # Current epoch time rendered with 13 significant digits, dot removed;
        # presumably a millisecond timestamp for the `w` parameter -- confirm.
        tmp = '{:13.13}'.format(time.time()).replace('.', '')
        image_url = 'https://setu.awsl.ee/api/setu!?w=' + tmp
        response = spider.get(image_url, cache=Spider.DISABLE_CACHE)
        res[str(i) + '.jpg'] = response.content
        time.sleep(0.1)
    # f = res['1.jpg']
    # tmp = '{:13.13}'.format(time.time()).replace('.', '')
    # r = lazy_spider.get('https://setu.awsl.ee/api/setu!?w=' + tmp,
Ejemplo n.º 8
0
import os
import sys

sys.path.append(os.pardir)

import logging

from lazy_spider import Spider
from lazy_spider.utils import ResourceRoot

# Module-level fixtures shared by the test classes below.
spider = Spider()
logger = logging.getLogger('lazy_spider')
# Resource root exercised by TestSpider.test_resource.
res = ResourceRoot('resources/imgs')


class TestResponse:
    """Checks on the response object returned by ``spider.get``."""

    def test_css(self):
        """The first ``title`` match on the Baidu front page has the expected text."""
        response = spider.get('https://www.baidu.com/')
        response.encoding = 'gb2313'
        title = response.css('title')[0]
        assert title.text == '百度一下,你就知道'


class TestSpider:
    def test_resource(self):
        logger.debug('list_dir: {}', res.list_dir)
        logger.debug('files: {}', str(res.files))
        logger.debug('dirs: {}', str(res.dirs))
        logger.debug('root_dir: {}', str(res.root_dir))
        res['hello'] = 'Hello, World.'
        hello = res['hello']