def parse(self, response):
    """Parse an LAEDC download-category listing page.

    Yields a detail Request for every download item on the page, then a
    Request for the next listing page when a ".next" link is present.
    """
    _response = response.response  # underlying requests.Response
    print(_response.status_code)
    soup = BeautifulSoup(_response.text, "html.parser")
    rows = soup.select('.dlm-downloads li')
    for item in rows:
        data = {}
        data['url'] = item.select_one('a').attrs['href']
        data['title'] = item.select_one('a').text
        print(self.proxies)
        yield Request({
            "url": data['url'],
            "proxies": self.proxies
        }, callback=self.detail_content, meta={'data': data})
    # Next page ("下一页"): on the last page there is no ".next" anchor and
    # select_one() returns None — the original dereferenced .attrs on it and
    # raised AttributeError. Guard the anchor itself; the old `if next_url:`
    # check ran after concatenation and was always truthy.
    next_link = soup.select_one(".next")
    if next_link is not None:
        next_url = "https://laedc.org" + next_link.attrs["href"]
        print(next_url)
        print(self.proxies)
        yield Request({
            "url": next_url,
            "proxies": self.proxies
        }, callback=self.parse)
def parse(self, response):
    """Parse a listing page using xpath Selectors.

    Yields a detail Request per <li><a> item, then a Request for the
    next-page ("下一页") link when one exists.
    """
    # Originating request, printed for debugging.
    request = response.request
    print(request)
    # Underlying requests.Response object.
    _response = response.response
    # _response.encoding = "utf8"
    sel = Selector(response=_response)
    items = sel.xpath(".//li")  # renamed from `list` — don't shadow the builtin
    for item in items:
        data = {}
        data["url"] = urljoin(_response.url, item.xpath(".//a/@href").extract_first())
        data["title"] = item.xpath(".//a/text()").extract_first()
        yield Request({"url": data['url']}, callback=self.detail_content, meta={"data": data})
    # Next page: extract_first() returns None when no "下一页" link exists.
    # urljoin(base, None) silently returns `base`, so the original
    # `if next_url:` was always truthy and re-queued the current page
    # forever. Check the raw href before joining.
    next_href = sel.xpath(".//a[contains(text(),'下一页')]/@href").extract_first()
    if next_href:
        next_url = urljoin(_response.url, next_href)
        yield Request({"url": next_url}, callback=self.parse)
def parse(self, response):
    """Parse a listing page with BeautifulSoup.

    Yields a detail Request for each item under ul.xinxi_ul, then a
    Request for the next-page ("下一页") link when one is present.
    """
    # Originating request (kept for parity with sibling parsers; unused).
    request = response.request
    _response = response.response
    soup = BeautifulSoup(_response.text, "html.parser")
    items = soup.select('ul.xinxi_ul li')  # renamed from `list` — don't shadow the builtin
    for item in items:
        data = {}
        data["url"] = urljoin(_response.url, item.select_one("a").attrs["href"])
        data["title"] = item.select_one("a").text
        yield Request({"url": data['url']}, callback=self.detail_content, meta={"data": data})
    # Next page: find() returns None on the last page — the original
    # dereferenced .attrs on it and raised AttributeError. Guard first.
    next_anchor = soup.find("a", text=re.compile("下一页"))
    if next_anchor is not None:
        next_url = urljoin(_response.url, next_anchor.attrs["href"])
        print(next_url)
        yield Request({"url": next_url}, callback=self.parse)
def start_requests(self):
    """Entry point: emit crawl requests.

    In debug mode, issues one fixed request first; then (in both modes)
    drains the task queue, yielding one Request per task until empty.
    """
    if self.debug:
        task_obj = {
            "id": 1,
            "url": "http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_1.html"
        }
        requestData = self.get_requests(task_obj)
        # BUG FIX: the original passed callback=self.parse(), which CALLED
        # parse immediately (producing a generator object as the callback)
        # instead of handing over the bound method. Pass the method itself.
        yield Request(requestData, meta={"task": task_obj}, callback=self.parse)
    while 1:
        task_obj = self.get_task(obj=True)
        if not task_obj:
            logger.debug("没有任务")
            break
        url = task_obj.url
        # 下载 (download the task's page)
        yield Request({"url": url}, meta={"task": task_obj}, callback=self.parse)
    return
def start_requests(self):
    """Entry point: queue the LAEDC economic-impact-studies listing page."""
    # The original wrapped this in `for i in range(0, 1)`, which executes
    # exactly once — a plain single yield is equivalent.
    target = "https://laedc.org/research-analysis/search-reports/download-category/economic-impact-studies/"
    print(self.proxies)
    yield Request({"url": target, "proxies": self.proxies})
def start_requests(self):
    """Entry point: queue the first Beijing government procurement listing page."""
    # The original `for i in range(1, 2)` loop runs exactly one iteration,
    # so a single direct yield behaves identically.
    start_url = "http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_1.html"
    yield Request(start_url)