def parse(self, response: Response):
    """Follow every detail link on a list page, then follow pagination.

    Yields a Request for each detail URL found under the ``.list``
    element, routed to ``parse_detail``; if a "下一页" (next page) link
    exists, re-enters this callback with its href.
    """
    detail_urls = response.xpath('//*[@class="list"]//li//a/@href').getall()
    for detail_url in detail_urls:
        yield Request(url=detail_url, callback=self.parse_detail)
    # Extract the href in one query (the original evaluated the same
    # XPath twice and shadowed the builtin `next`). Checking the href
    # itself also avoids yielding Request(url=None) when the anchor
    # exists but carries no href attribute.
    next_url = response.xpath(
        '//*[@id="div_currpage"]//a[text()="下一页"]/@href').get()
    if next_url:
        yield Request(url=next_url, callback=self.parse)
def test_response(self):
    """Build a Response from a local HTML fixture and exercise xpath/links."""
    request = Request(
        "http://www.ocpe.com.cn/nengyuanjingji/zf/2020-08-29/3785.html")
    request.encoding = "utf-8"
    with open("test.html", "rb") as fixture:
        body = fixture.read()
    response = Response(body, 200, request)
    # Pull the anchor texts out of the news block and dump everything
    # the page links to.
    anchor_texts = response.xpath("//div[@class='xwt_a']//a/text()").getall()
    print(anchor_texts)
    print(response.links())
def parse(self, response: Response):
    """Print spider name and status, then re-queue the same URL for parse2."""
    banner = f'#######{self.name}'
    print(banner)
    print(response.status)
    print("222222222222222")
    # dont_filter lets the identical URL pass the dedup filter again.
    yield Request(url=response.url, callback=self.parse2, dont_filter=True)
def start_requests(self):
    """Seed the crawl with a single unfiltered request to 51job search.

    The original wrapped this yield in ``for i in range(1)`` — a loop
    that executes exactly once with an unused index — so the loop has
    been removed; the yielded request is identical.
    """
    yield Request(
        url="http://search.51job.com",
        callback=self.parse,
        dont_filter=True,
    )
def start_requests(self):
    """Yield one request per page (0–99) against the proxy-exercise endpoint."""
    url_template = 'http://exercise.kingname.info/exercise_middleware_ip/{}'
    for page in range(100):
        yield Request(url_template.format(page), callback=self.parse,
                      dont_filter=False, timeout=3)
def start_requests(self):
    """Issue the initial requests; subclasses may override this method.

    :return: generator of Request objects, one per configured start URL
    """
    for start_url in self.start_urls:
        yield Request(url=start_url, callback=self.parse)
async def fetch(url):
    """Download *url* via AioHttpDown; return the response, or None on error.

    Best-effort by design: any exception is printed and swallowed so the
    caller receives ``None`` rather than a raised error. The original had
    a redundant ``pass`` after ``print(e)`` and returned ``None``
    implicitly; both are now explicit.
    """
    try:
        req = Request(url=url)
        return await AioHttpDown().fetch(req)
    except Exception as e:  # broad on purpose: network failures must not propagate
        print(e)
        return None
def start_requests(self):
    """Yield 1000 identical unfiltered requests to a static JS asset.

    The original loop computed an exercise URL for each page and then
    immediately overwrote it with this constant, so the per-iteration
    assignments were dead code and have been removed; the yielded
    requests are unchanged.
    """
    url = 'https://s.bdstatic.com/common/openjs/amd/eslx.js'
    for _ in range(1000):
        yield Request(url, callback=self.parse, dont_filter=True, timeout=3)
def parse(self, response: Response):
    """Follow every link under the ``.list`` element into parse_detail."""
    print("run parse...........................................")
    print(response.status)
    hrefs = response.xpath(
        "/html/body//div/ul[@class='list']/li/a/@href").getall()
    for href in hrefs:
        yield Request(url=href, callback=self.parse_detail, dont_filter=True)
    print(22222222222222222222222222222222222222)
    print(3333)
def start_requests(self):
    """Queue 1122 unfiltered requests for the same static JS resource."""
    target = 'https://s.bdstatic.com/common/openjs/amd/eslx.js'
    for _ in range(1122):
        yield Request(target, callback=self.parse, dont_filter=True)
def start_requests(self):
    """Yield one parse-bound request per configured start URL."""
    for seed in self.start_urls:
        yield Request(url=seed, callback=self.parse)
def parse(self, response: Response):
    """Re-queue the current URL 300 times, then emit parsed bid items."""
    for _ in range(300):
        yield Request(url=response.url, dont_filter=True)
    # Delegate item extraction to the item class itself.
    yield from BidItem.get_items(response.text)
def start_requests(self):
    """Yield 30 duplicate (unfiltered) requests to the same Jianshu article."""
    article = "https://www.jianshu.com/p/e8f7f6c82be6"
    # The original materialized a 30-element list of the same URL; a
    # plain counted loop yields the identical sequence of requests.
    for _ in range(30):
        yield Request(url=article, callback=self.parse, dont_filter=True)