Example #1
import scrapy
from scrapy.utils.project import get_project_settings

# rm_file, read_file, write_file are small project helpers; a sketch follows Example #1.


class Pansoso2Spider(scrapy.Spider):
    """Pansoso (盘搜搜) crawler 02 - extracts download info from the input pages."""
    name = "pansoso2"
    allowed_domains = ["www.pansoso.com"]
    settings = get_project_settings()

    level2_file = settings.get('LEVEL2_FILE')
    level3_file = settings.get('LEVEL3_FILE')
    rm_file(level3_file)  # note: runs at class-definition (import) time
    start_urls = read_file(level2_file)
         
    def parse(self, response):
        if response.status == 200:
            selector = scrapy.Selector(response)
            # <div class="down">
            infos = selector.xpath('//div[@class="down"]')
            level3_urls = []
            for info in infos:
                hrefs = info.xpath('a/@href').extract()
                # keep direct download links; drop .html detail pages
                hrefs = [i for i in hrefs if '.html' not in i]
                if hrefs:  # guard: a block may have no matching link
                    level3_urls.append(hrefs[0])
            write_file(self.level3_file, level3_urls, mode='append')
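
All six snippets lean on three small project helpers (rm_file, read_file, write_file) that this page does not show. The sketch below is a guess at their behavior, assuming plain one-URL-per-line text files; only the names come from the snippets:

import os

def rm_file(path):
    # assumed helper: delete the output file if it already exists
    if os.path.exists(path):
        os.remove(path)

def read_file(path):
    # assumed helper: read one URL per line, skipping blanks
    with open(path, encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]

def write_file(path, lines, mode='append'):
    # assumed helper: write one line per entry, appending by default
    with open(path, 'a' if mode == 'append' else 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in lines)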
Example #2
    def __init__(self, *args, **kwargs):
        super(PansosoSpider3, self).__init__(*args, **kwargs)

        self.level3_file = settings.get('LEVEL3_FILE')
        self.level4_file = settings.get('LEVEL4_FILE')
        rm_file(self.level4_file)
        self.start_urls = read_file(self.level3_file)
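
Unlike Example #1, this variant does the file bookkeeping inside __init__ rather than in the class body, so the I/O does not run as a side effect of merely importing the module. Note that settings is not defined in the fragment; it is presumably a module-level get_project_settings() result.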
Example #3
import time

import scrapy
from scrapy.utils.project import get_project_settings


class PansosoSpiderThread(scrapy.Spider):
    """Pansoso (盘搜搜) crawler 03 - extracts the Baidu Cloud (百度云) share links."""
    name = "pansoso3"
    allowed_domains = ["www.pansoso.com"]

    settings = get_project_settings()
    level3_file = settings.get('LEVEL3_FILE')
    level4_file = settings.get('LEVEL4_FILE')
    rm_file(level4_file)
    start_urls = read_file(level3_file)

    def parse(self, response):
        time.sleep(0.5)  # note: blocks Scrapy's event loop; DOWNLOAD_DELAY is the idiomatic throttle
        if response.status == 200:
            selector = scrapy.Selector(response)

            infos = selector.xpath('//div[@class="file"]')
            level4_urls = []
            for info in infos:
                href = info.xpath('p/a/@href').extract_first()
                if href:  # guard: skip blocks without a link
                    print(href)  # debug trace of each extracted link
                    level4_urls.append(href)
            write_file(self.level4_file, level4_urls, mode='append')
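
For reference, any of these spiders can also be run outside the scrapy crawl command, using Scrapy's standard CrawlerProcess API (the spider class is assumed to be importable from your project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(PansosoSpiderThread)  # or any of the other spider classes
process.start()  # blocks until the crawl finishes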
Example #4
    def __init__(self, mode='append', *args, **kwargs):
        super(DashengpanSpider2, self).__init__(*args, **kwargs)

        self.mode = mode
        self.level2_file = settings.get('LEVEL2_FILE')
        self.level3_file = settings.get('LEVEL3_FILE')
        if self.mode == 'override':
            rm_file(self.level3_file)  # start fresh instead of appending
        self.start_urls = read_file(self.level2_file)
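
Because mode is an ordinary spider argument, it can also be set from the command line with Scrapy's -a flag. The spider's name attribute is not shown in this fragment, so dashengpan2 below is an assumed name:

    scrapy crawl dashengpan2                    # append to the existing output (default)
    scrapy crawl dashengpan2 -a mode=override   # wipe the previous output first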
Example #5
    def __init__(self, *args, **kwargs):
        super(PansosoSpider2, self).__init__(*args, **kwargs)

        self.level2_file = settings.get('LEVEL2_FILE')
        self.level3_file = settings.get('LEVEL3_FILE')
        rm_file(self.level3_file)
        self.start_urls = read_file(self.level2_file)

        # leftover debug output inspecting the spider instance
        print('>>>>>>>>>>>>> 2')
        print(dir(self))
        print(dir(self.start_requests))
        print(self.start_requests)
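
The debug prints above poke at start_requests, the Scrapy hook that turns start_urls into actual requests. Its default behavior is roughly the following simplified sketch (the real implementation also sets dont_filter=True):

    def start_requests(self):
        # default Scrapy behavior, roughly: one GET request per start URL
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)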
Example #6
    def __init__(self, mode='append', *args, **kwargs):
        super(DashengpanSpider2, self).__init__(*args, **kwargs)

        self.mode = mode
        self.level2_file = settings.get('LEVEL2_FILE')
        self.result_file = settings.get('RESULT_FILE')
        if self.mode == 'override':
            rm_file(self.result_file)
        self.start_urls = read_file(self.level2_file)
        # keep only lines that look like absolute URLs
        self.start_urls = [i for i in self.start_urls if i.startswith('http')]

        if settings.get("IS_USE_DELAY_LOAD_URL", False):
            # pages here are lazy-loaded, so drive a real Chrome via Selenium
            # (assumes: from selenium import webdriver)
            self.browser = webdriver.Chrome()
            self.browser.set_page_load_timeout(30)
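
One thing this fragment omits is shutting the browser down; without that, every run leaks a Chrome process. A minimal sketch using Scrapy's closed hook, which is called when the spider finishes:

    def closed(self, reason):
        # quit Chrome when the spider finishes, if it was ever started
        if getattr(self, 'browser', None):
            self.browser.quit()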