Code Example #1
    def parse(self, response):
        u"""

        说明:
            1. 大圣盘 自带链接校验机制 
                HTML中含 该链接有效,可以访问 ,说明链接有效 
                TODO 由于存在校验延后的问题, 此处未使用

        """
        level3_urls = []
        print('>>>>>>>>' * 20)
        if response.status == 200:
            selector = scrapy.Selector(response)
            infos = selector.xpath('//div[@class="resource-meta"]')
            print(infos)
            print(dir(infos))

            for info in infos:
                # download links inside the resource block, if any
                hrefs = info.xpath('.//a/@href').extract()
                print(hrefs)
                hrefs = [i for i in hrefs if '.html' not in i]
                if not hrefs:
                    continue
                href = hrefs[0]
                if '404.html' not in href:
                    level3_urls.append(href)

            write_file(self.level3_file, level3_urls, mode='append')
            print("写入文件[%s]成功" % self.level3_file)
Code Example #2
    def parse(self, response, autorun=True):
        u"""
        说明:
            1. 由于此网站 此链接 会自动跳转百度网盘
            所以直接获取 访问的url 即可
        exp:
            原链接: http://w.sbdp.hao7188.com/down.asp?id=109483377&token=803676c3cd6a8eb8d15a41ba79e85da1&bt=Excel%E5%A4%8D%E4%B9%A0.rar
            自动跳转: https://yun.baidu.com/s/1c0b2Oec

        """
        level3_urls = []

        if response.status == 200:
            selector = scrapy.Selector(response)

            if autorun:
                href = response.url
                if '404.html' not in href and not any(
                        domain in href for domain in self.allowed_domains):
                    level3_urls.append(href)
            else:
                infos = selector.xpath('//p[@style="text-align:center;"]')

                for info in infos:
                    hrefs = info.xpath('a/@href').extract()
                    hrefs = [i for i in hrefs if '.html' not in i]
                    if not hrefs:
                        continue
                    href = hrefs[0]
                    if '404.html' not in href:
                        level3_urls.append(href)

            write_file(self.level3_file, level3_urls, mode='append')
            print("写入文件[%s]成功" % self.level3_file)
Code Example #3
    def _prase_prepare(self):
        """ 解析前准备 """

        if self.mode == 'override':
            rm_file(self.level2_file)

        write_file(self.level1_file, self.start_urls, mode=self.mode)
        print("写入文件[%s]成功" % self.level1_file)
Code Example #4
    def _prase_prepare(self):
        """ 解析前准备 """

        self.level1_file = settings.get('LEVEL1_FILE')
        self.level2_file = settings.get('LEVEL2_FILE')
        rm_file(self.level1_file)
        rm_file(self.level2_file)

        write_file(self.level1_file, self.start_urls, mode='override')
        print("写入文件[%s]成功" % self.level1_file)
Code Example #5
File: pansosoSpider2.py  Project: wsws1/Spiders
    def parse(self, response):
        if response.status == 200:
            selector = scrapy.Selector(response)
            # <div class="down">
            infos = selector.xpath('//div[@class="down"]')
            level3_urls = []
            for info in infos:
                hrefs = info.xpath('a/@href').extract()
                hrefs = [i for i in hrefs if '.html' not in i]
                if not hrefs:
                    continue
                href = hrefs[0]
                level3_urls.append(href)
            write_file(self.level3_file, level3_urls, mode='append')
Code Example #6
    def parse(self, response):
        time.sleep(0.5)  # crude per-page throttle
        if response.status == 200:
            selector = scrapy.Selector(response)

            infos = selector.xpath('//div[@class="file"]')
            level4_urls = []
            for info in infos:
                href = info.xpath('p/a/@href').extract()[0]
                print(href)
                level4_urls.append(href)
            write_file(self.level4_file, level4_urls, mode='append')
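The time.sleep(0.5) above throttles requests inside the callback; the same effect is usually configured through Scrapy settings instead. An illustrative settings.py fragment (values are examples, not taken from this project):

# settings.py
DOWNLOAD_DELAY = 0.5         # fixed delay between requests to the same site
AUTOTHROTTLE_ENABLED = True  # let Scrapy adapt the delay to server latency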
Code Example #7
    def parse(self, response):

        if response.status == 200:
            selector = scrapy.Selector(response)
            infos = selector.xpath('//div[@class="resource-info"]')

            level2_urls = []
            for info in infos:
                # item = scrapy.JsuserItem()
                href = self.base_url + info.xpath('h1/a/@href').extract()[0]
                level2_urls.append(href)
                SpiderItem.href = href
            write_file(self.level2_file, level2_urls, mode='append')
            print("写入文件[%s]成功" % self.level2_file)
Code Example #8
    def parse(self, response):

        self._prase_prepare()

        print('>>>>>>>>> panpan1 parse')
        if response.status == 200:
            selector = scrapy.Selector(response)

            # <div class="pss">
            infos = selector.xpath('//div[@class="pss"]')
            level2_urls = []
            for info in infos:
                # item = scrapy.JsuserItem()
                href = info.xpath('h2/a/@href').extract()[0]
                level2_urls.append(href)
                print(href)
                SpiderItem.href = href
            write_file(self.level2_file, level2_urls, mode='append')
            print("写入文件[%s]成功" % self.level2_file)
Code Example #9
    def parse(self, response):
        u"""

        说明:
            1. 大圣盘 自带链接校验机制 
                HTML中含 该链接有效,可以访问 ,说明链接有效 
            2. 反爬虫机制
                1. 延迟加载 

        提取码HTML
            <div class="resource-meta" data-v-7b9e41d7="">
                <span data-v-7b9e41d7="" class="meta-item copy-item">
                    <span data-v-7b9e41d7="" class="label">提取密码</span> 
                    h99f 
                    <span data-v-7b9e41d7="" class="copy">点击复制</span>
                </span> 
                <span data-v-7b9e41d7="" class="tip">提取码复制成功</span>
            </div>
    
        """
        result_url = []
        get_code_flag = get_url_flag = True
        #if response.status == 200 and '该链接有效,可以访问' in response.text:
        if response.status == 200:
            selector = scrapy.Selector(response)
            code = ''

            try:
                # extraction code
                if get_code_flag:
                    code_infos = selector.xpath(
                        '//div[@class="resource-meta"]')
                    # code_info_str = str(code_infos.extract()[0], encoding='utf-8')
                    if code_infos.extract():
                        code_info_str = code_infos.extract()[0]  #.decode()

                        # the code is a bare 4-character token surrounded by
                        # spaces in the serialized resource-meta block
                        reg = r'    \S{4} '
                        lis = re.findall(reg, code_info_str, re.I)
                        lis = [
                            i.strip() for i in lis if i and len(i.strip()) == 4
                        ]
                        code = lis[0] if lis else ''
            except Exception as err:
                print('>> Failed to get the extraction code!: \n%s' % err)
                print(traceback.format_exc())

            try:
                # the download link
                if get_url_flag:
                    selector = scrapy.Selector(response)
                    # url_infos = selector.xpath('//div[@class="button-inner baidu-button-inner"]')
                    url_infos = selector.xpath('//div[@class="button-inner"]')
                    print(url_infos)
                    for url_info in url_infos:
                        urls = url_info.xpath('a/@href').extract()
                        urls = [
                            i for i in urls
                            if '.html' not in i and 'baidu' in i
                        ]
                        if not urls:
                            continue
                        url = urls[0]
                        href = url + ' ' + code
                        print('> Got link: %s extraction code: %s' % (url, code))
                        result_url.append(href)

                if result_url:
                    write_file(self.result_file, result_url, mode='append')
                    print("Successfully wrote file [%s]" % self.result_file)
            except Exception as err:
                print('>> Failed to get the link!: \n%s' % err)
                print(traceback.format_exc())

        else:
            print("该链接[%s]已失效 or 未使用延迟加载处理 IS_USE_DELAY_LOAD_URL" %
                  response.url)
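The whitespace regex above depends on how the HTML happens to be serialized. Based only on the fragment quoted in the docstring, the extraction code could instead be read as the bare text node of the copy-item span; a sketch (the helper name is illustrative):

def extract_code(response):
    # direct text nodes of the copy-item span; the first non-empty one
    # is the code itself (e.g. "h99f" in the docstring sample)
    texts = response.xpath(
        '//span[contains(@class, "copy-item")]/text()').extract()
    texts = [t.strip() for t in texts if t.strip()]
    return texts[0] if texts else ''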