Example #1
    def __init__(self, *args, **kwargs):
        super(PansosoSpider3, self).__init__(*args, **kwargs)

        self.level3_file = settings.get('LEVEL3_FILE')
        self.level4_file = settings.get('LEVEL4_FILE')
        # clear any stale level-4 output before re-crawling
        rm_file(self.level4_file)
        self.start_urls = read_file(self.level3_file)
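
All of these snippets lean on three project-local helpers that none of the examples define: rm_file, read_file, and write_file, plus a module-level settings = get_project_settings(). A minimal sketch of what the helpers plausibly look like, assuming the LEVELn files are plain newline-delimited URL lists (the names come from the examples; the bodies are guesses):

import os

def rm_file(path):
    """Delete the file if it exists; ignore a missing file."""
    if path and os.path.exists(path):
        os.remove(path)

def read_file(path):
    """Return the file's non-empty lines, stripped, as a list."""
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def write_file(path, lines, mode='append'):
    """Write one entry per line; mode is 'append' or 'override'."""
    with open(path, 'w' if mode == 'override' else 'a') as f:
        for line in lines:
            f.write(line + '\n')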
Example #2
    def _prase_prepare(self):
        """ 解析前准备 """

        if self.mode == 'override':
            rm_file(self.level2_file)

        write_file(self.level1_file, self.start_urls, mode=self.mode)
        print("写入文件[%s]成功" % self.level1_file)
Example #3
    def __init__(self, mode='append', *args, **kwargs):
        super(DashengpanSpider2, self).__init__(*args, **kwargs)

        self.mode = mode
        self.level2_file = settings.get('LEVEL2_FILE')
        self.level3_file = settings.get('LEVEL3_FILE')
        if self.mode == 'override':
            rm_file(self.level3_file)
        self.start_urls = read_file(self.level2_file)
Example #4
    def _prase_prepare(self):
        """ 解析前准备 """

        self.level1_file = settings.get('LEVEL1_FILE')
        self.level2_file = settings.get('LEVEL2_FILE')
        rm_file(self.level1_file)
        rm_file(self.level2_file)

        write_file(self.level1_file, self.start_urls, mode='override')
        print("写入文件[%s]成功" % self.level1_file)
Example #5
    def __init__(self, *args, **kwargs):
        super(PansosoSpider2, self).__init__(*args, **kwargs)

        self.level2_file = settings.get('LEVEL2_FILE')
        self.level3_file = settings.get('LEVEL3_FILE')
        rm_file(self.level3_file)
        self.start_urls = read_file(self.level2_file)

        # leftover debug output inspecting the spider instance and its
        # start_requests hook; harmless, but noisy in production
        print('>>>>>>>>>>>>> 2')
        print(dir(self))
        print(dir(self.start_requests))
        print(self.start_requests)
Example #6
    def __init__(self, mode='append', *args, **kwargs):
        super(DashengpanSpider2, self).__init__(*args, **kwargs)

        self.mode = mode
        self.level2_file = settings.get('LEVEL2_FILE')
        self.result_file = settings.get('RESULT_FILE')
        if self.mode == 'override':
            rm_file(self.result_file)
        self.start_urls = read_file(self.level2_file)
        self.start_urls = [i for i in self.start_urls if i.startswith('http')]

        if settings.get("IS_USE_DELAY_LOAD_URL", False):
            # lazy loading: drive a real browser for JS-rendered pages
            self.browser = webdriver.Chrome()
            self.browser.set_page_load_timeout(30)
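
Example #6 only constructs the Chrome driver (which needs selenium's from selenium import webdriver); how it is used during parsing is not shown in this excerpt. A common Scrapy + Selenium pattern, sketched under the assumption that the spider re-fetches each URL in the browser so lazy-loaded content can render (this parse body is illustrative, not the project's actual code):

    def parse(self, response):
        if settings.get("IS_USE_DELAY_LOAD_URL", False):
            # re-load the page in Chrome so JS-driven lazy loading completes
            self.browser.get(response.url)
            time.sleep(2)  # crude wait; selenium's WebDriverWait is more robust
            selector = scrapy.Selector(text=self.browser.page_source)
        else:
            selector = scrapy.Selector(response)
        # ... extract links from `selector` as in the other examples

    def closed(self, reason):
        # Scrapy calls closed() when the spider finishes; release the browser
        if getattr(self, 'browser', None):
            self.browser.quit()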
Example #7
import scrapy
from scrapy.utils.project import get_project_settings
# rm_file / read_file / write_file are project-local helpers (see the sketch under Example #1)
class Pansoso2Spider(scrapy.Spider):
    u""" 盘搜搜爬虫02 - 输入页面提取下载信息

    """
    name = "pansoso2"
    allowed_domains = ["www.pansoso.com"]
    settings = get_project_settings()

    level2_file = settings.get('LEVEL2_FILE')
    level3_file = settings.get('LEVEL3_FILE')
    rm_file(level3_file)
    start_urls = read_file(level2_file)
         
    def parse(self, response):
        if response.status == 200:
            selector = scrapy.Selector(response)
            # <div class="down">
            infos = selector.xpath('//div[@class="down"]')
            level3_urls = []
            for info in infos:
                hrefs = info.xpath('a/@href').extract()
                hrefs = [i for i in hrefs if '.html' not in i]
                if hrefs:  # guard: some blocks may have no non-.html link
                    level3_urls.append(hrefs[0])
            write_file(self.level3_file, level3_urls, mode='append')
Example #8
import time

import scrapy
from scrapy.utils.project import get_project_settings
class PansosoSpiderThread(scrapy.Spider):
    u""" 盘搜搜爬虫03 - 提取百度云链接

    """
    name = "pansoso3"
    allowed_domains = ["www.pansoso.com"]

    settings = get_project_settings()
    level3_file = settings.get('LEVEL3_FILE')
    level4_file = settings.get('LEVEL4_FILE')
    rm_file(level4_file)
    start_urls = read_file(level3_file)

    def parse(self, response):
        time.sleep(0.5)  # crude throttle; DOWNLOAD_DELAY in settings is the idiomatic way
        if response.status == 200:
            selector = scrapy.Selector(response)

            infos = selector.xpath('//div[@class="file"]')
            level4_urls = []
            for info in infos:
                href = info.xpath('p/a/@href').extract_first()
                if href:  # guard against blocks missing a link
                    print(href)
                    level4_urls.append(href)
            write_file(self.level4_file, level4_urls, mode='append')
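
Examples #7 and #8 chain through the level files: pansoso2 reads LEVEL2_FILE and appends detail-page URLs to LEVEL3_FILE, then pansoso3 reads LEVEL3_FILE and appends the extracted links to LEVEL4_FILE. They are evidently meant to run in sequence, e.g.:

    scrapy crawl pansoso2
    scrapy crawl pansoso3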
Example #9
    def __init__(self,
                 search_text='excel',
                 page=1,
                 mode='append',
                 *args,
                 **kwargs):
        u""" 指定爬虫参数
        exp:
            scrapy crawl myspider -a http_user=myuser -a http_pass=mypassword -a user_agent=mybot

        :param search_text: 搜索内容
        :param page: 查询页数
        :param mode: override / append 覆盖 or 追加

        """
        super(DashengpanSpider1, self).__init__(*args, **kwargs)

        page = int(page)
        self.mode = mode
        self.base_url = "https://www.dashengpan.com"
        self.level1_file = settings.get('LEVEL1_FILE')
        self.level2_file = settings.get('LEVEL2_FILE')

        print('>>>>>>>> @Spider_name: %s @search_text: %s @page: %s' %
              (self.name, search_text, page))
        start_url = 'https://www.dashengpan.com/search?keyword=%s' % (
            search_text)
        # minimal hand-rolled URL encoding; see the urllib.parse sketch below
        http_dic = {
            ' ': '%20',
        }
        for key, val in list(http_dic.items()):
            start_url = start_url.replace(key, val)
        self.start_urls = []
        for i in range(1, page + 1):
            self.start_urls.append(start_url + '&page=%s' % i)

        if self.mode == 'override':
            rm_file(self.level1_file)

        self._prase_prepare()
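
Example #9 builds its search URL with a one-entry replacement table that only encodes spaces. The standard library handles the general case; a sketch of the same construction with urllib.parse.quote (quote_plus would use '+' for spaces instead of %20):

    from urllib.parse import quote

    search_text = 'excel sheet'
    page = 3
    start_url = 'https://www.dashengpan.com/search?keyword=%s' % quote(search_text)
    start_urls = [start_url + '&page=%s' % i for i in range(1, page + 1)]
    print(start_urls)  # ...keyword=excel%20sheet&page=1 ... &page=3

With the arguments above, the spider would typically be launched as scrapy crawl <name> -a search_text='excel sheet' -a page=3 -a mode=override (the spider's registered name is not shown in this excerpt).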