def parse(self, response): u""" 说明: 1. 大圣盘 自带链接校验机制 HTML中含 该链接有效,可以访问 ,说明链接有效 TODO 由于存在校验延后的问题, 此处未使用 """ level3_urls = [] print('>>>>>>>>' * 20) if response.status == 200: selector = scrapy.Selector(response) infos = selector.xpath('//div[@class="resource-meta"]') print(infos) print(dir(infos)) for info in infos: hrefs = info.xpath('@span').extract() print(hrefs) hrefs = [i for i in hrefs if '.html' not in i] href = hrefs[0] if '404.html' not in href: level3_urls.append(href) write_file(self.level3_file, level3_urls, mode='append') print("写入文件[%s]成功" % self.level3_file)
def parse(self, response, autorun=True): u""" 说明: 1. 由于此网站 此链接 会自动跳转百度网盘 所以直接获取 访问的url 即可 exp: 原链接: http://w.sbdp.hao7188.com/down.asp?id=109483377&token=803676c3cd6a8eb8d15a41ba79e85da1&bt=Excel%E5%A4%8D%E4%B9%A0.rar 自动跳转: https://yun.baidu.com/s/1c0b2Oec """ level3_urls = [] if response.status == 200: selector = scrapy.Selector(response) if autorun: href = response.url if '404.html' not in href and True not in [ i in href for i in self.allowed_domains ]: level3_urls.append(href) else: infos = selector.xpath('//p[@style="text-align:center;"]') for info in infos: hrefs = info.xpath('a/@href').extract() hrefs = [i for i in hrefs if '.html' not in i] href = hrefs[0] if '404.html' not in href: level3_urls.append(href) write_file(self.level3_file, level3_urls, mode='append') print("写入文件[%s]成功" % self.level3_file)
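The `True not in [...]` test above keeps a URL only when it falls outside every domain in `self.allowed_domains`, i.e. when the redirect has already landed on the external Baidu Netdisk link. A minimal sketch of the same check written with `any()` for readability; the sample domains below are placeholders, not the spider's real configuration:

# Sketch of the domain filter used above; the domains are illustrative only.
allowed_domains = ['hao7188.com', 'example-pan.com']

def is_external(href, domains=allowed_domains):
    # True when the redirected URL points outside the crawled site,
    # i.e. it is the final Baidu Netdisk share link we want to keep.
    return '404.html' not in href and not any(d in href for d in domains)

print(is_external('https://yun.baidu.com/s/1c0b2Oec'))         # True
print(is_external('http://w.sbdp.hao7188.com/down.asp?id=1'))  # False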
def _prase_prepare(self): """ 解析前准备 """ if self.mode == 'override': rm_file(self.level2_file) write_file(self.level1_file, self.start_urls, mode=self.mode) print("写入文件[%s]成功" % self.level1_file)
def _prase_prepare(self): """ 解析前准备 """ self.level1_file = settings.get('LEVEL1_FILE') self.level2_file = settings.get('LEVEL2_FILE') rm_file(self.level1_file) rm_file(self.level2_file) write_file(self.level1_file, self.start_urls, mode='override') print("写入文件[%s]成功" % self.level1_file)
def parse(self, response): if response.status == 200: selector = scrapy.Selector(response) # <div class="down"> infos = selector.xpath('//div[@class="down"]') level3_urls = [] for info in infos: hrefs = info.xpath('a/@href').extract() hrefs = [i for i in hrefs if '.html' not in i] href = hrefs[0] level3_urls.append(href) write_file(self.level3_file, level3_urls, mode='append')
def parse(self, response):
    time.sleep(0.5)
    if response.status == 200:
        selector = scrapy.Selector(response)
        infos = selector.xpath('//div[@class="file"]')
        level4_urls = []
        for info in infos:
            href = info.xpath('p/a/@href').extract_first()
            if not href:
                continue
            print(href)
            level4_urls.append(href)
        write_file(self.level4_file, level4_urls, mode='append')
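Note that `time.sleep(0.5)` inside a callback pauses Scrapy's whole reactor, not just this request; throttling is normally configured in `settings.py` instead. A sketch of the equivalent settings-based delay, with illustrative values not taken from this project:

# settings.py -- illustrative throttling values.
DOWNLOAD_DELAY = 0.5             # wait ~0.5 s between requests to the same site
RANDOMIZE_DOWNLOAD_DELAY = True  # jitter the delay so the crawl looks less bot-like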
def parse(self, response): if response.status == 200: selector = scrapy.Selector(response) infos = selector.xpath('//div[@class="resource-info"]') level2_urls = [] for info in infos: # item = scrapy.JsuserItem() href = self.base_url + info.xpath('h1/a/@href').extract()[0] level2_urls.append(href) SpiderItem.href = href write_file(self.level2_file, level2_urls, mode='append') print("写入文件[%s]成功" % self.level2_file)
def parse(self, response):
    self._prase_prepare()
    print('>>>>>>>>> panpan1 parse')
    if response.status == 200:
        selector = scrapy.Selector(response)
        # <div class="pss">
        infos = selector.xpath('//div[@class="pss"]')
        level2_urls = []
        for info in infos:
            # item = scrapy.JsuserItem()
            href = info.xpath('h2/a/@href').extract_first()
            if not href:
                continue
            level2_urls.append(href)
            print(href)
            SpiderItem.href = href
        write_file(self.level2_file, level2_urls, mode='append')
        print("Wrote file [%s] successfully" % self.level2_file)
def parse(self, response): u""" 说明: 1. 大圣盘 自带链接校验机制 HTML中含 该链接有效,可以访问 ,说明链接有效 2. 反爬虫机制 1. 延迟加载 提取码HTML <div class="resource-meta" data-v-7b9e41d7=""> <span data-v-7b9e41d7="" class="meta-item copy-item"> <span data-v-7b9e41d7="" class="label">提取密码</span> h99f <span data-v-7b9e41d7="" class="copy">点击复制</span> </span> <span data-v-7b9e41d7="" class="tip">提取码复制成功</span> </div> """ result_url = [] get_code_flag = get_url_flag = True #if response.status == 200 and '该链接有效,可以访问' in response.text: if response.status == 200: selector = scrapy.Selector(response) code = '' try: # 提取码 if get_code_flag: code_infos = selector.xpath( '//div[@class="resource-meta"]') # code_info_str = str(code_infos.extract()[0], encoding='utf-8') if bool(code_infos.extract()): code_info_str = code_infos.extract()[0] #.decode() reg = ' \S{4} ' lis = re.findall(reg, code_info_str, re.I) lis = [ i.strip() for i in lis if i and len(i.strip()) == 4 ] code = lis[0] if bool(lis) else '' except Exception as err: print('>> 获取提取码失败!: \n%s' % err) print(traceback.format_exc()) try: # 链接 if get_url_flag: selector = scrapy.Selector(response) # url_infos = selector.xpath('//div[@class="button-inner baidu-button-inner"]') url_infos = selector.xpath('//div[@class="button-inner"]') print(url_infos) for url_info in url_infos: urls = url_info.xpath('a/@href').extract() urls = [ i for i in urls if '.html' not in i and 'baidu' in i ] if not bool(urls): continue url = urls[0] href = url + ' ' + code print('> 获取链接: %s 提取码: %s' % (url, code)) result_url.append(href) if bool(result_url): write_file(self.result_file, result_url, mode='append') print("写入文件[%s]成功" % self.result_file) except Exception as err: print('>> 获取链接失败!: \n%s' % err) print(traceback.format_exc()) else: print("该链接[%s]已失效 or 未使用延迟加载处理 IS_USE_DELAY_LOAD_URL" % response.url)