Example 1
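A parse callback that scrapes download links from a laedc.org listing page with BeautifulSoup, yields a detail-page Request for each item (forwarding self.proxies), and follows the ".next" pagination link back into parse.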
    def parse(self, response):
        # Unwrap the Response object from the native requests module
        _response = response.response
        print(_response.status_code)
        soup = BeautifulSoup(_response.text, "html.parser")
        rows = soup.select('.dlm-downloads li')
        for item in rows:
            data = {}
            data['url'] = item.select_one('a').attrs['href']
            data['title'] = item.select_one('a').text
            print(self.proxies)
            yield Request({
                "url": data['url'],
                "proxies": self.proxies
            },
                          callback=self.detail_content,
                          meta={'data': data})
        # Next page: guard against the ".next" link being absent on the last page
        next_link = soup.select_one(".next")
        if next_link:
            next_url = "https://laedc.org" + next_link.attrs["href"]
            print(next_url)
            print(self.proxies)
            yield Request({
                "url": next_url,
                "proxies": self.proxies
            },
                          callback=self.parse)

        return
Example 2
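The same listing-plus-pagination pattern built on an XPath Selector instead of BeautifulSoup; relative links are resolved against the page URL with urljoin.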
 def parse(self, response):
     # Get the original request object
     request = response.request
     print(request)
     # Unwrap the Response object from the native requests module
     _response = response.response
     # _response.encoding = "utf8"
     s = Selector(response=_response)
     items = s.xpath(".//li")
     for item in items:
         data = {}
         data["url"] = urljoin(_response.url, item.xpath(".//a/@href").extract_first())
         data["title"] = item.xpath(".//a/text()").extract_first()
         yield Request({"url": data['url']}, callback=self.detail_content, meta={"data": data})
     # Next page: the anchor text '下一页' means "next page"; guard against the
     # link being missing on the last page
     next_href = s.xpath(".//a[contains(text(),'下一页')]/@href").extract_first()
     if next_href:
         next_url = urljoin(_response.url, next_href)
         yield Request({"url": next_url}, callback=self.parse)
Example 3
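A BeautifulSoup variant of the pattern above: CSS selectors pick the item list, and the next-page link is located by its anchor text ('下一页', "next page") with re.compile.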
 def parse(self, response):
     # Get the original request object
     request = response.request
     # Unwrap the Response object from the native requests module
     _response = response.response
     soup = BeautifulSoup(_response.text, "html.parser")
     items = soup.select('ul.xinxi_ul li')
     for item in items:
         data = {}
         data["url"] = urljoin(_response.url,
                               item.select_one("a").attrs["href"])
         data["title"] = item.select_one("a").text
         yield Request({"url": data['url']},
                       callback=self.detail_content,
                       meta={"data": data})
     # Next page: find the anchor whose text is '下一页' ("next page") and
     # guard against it being missing on the last page
     next_link = soup.find("a", text=re.compile("下一页"))
     if next_link:
         next_url = urljoin(_response.url, next_link.attrs["href"])
         print(next_url)
         yield Request({"url": next_url}, callback=self.parse)
Example 4
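A start_requests entry point driven by a task queue: in debug mode it seeds one hard-coded task, then it keeps pulling tasks from self.get_task until none remain.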
 def start_requests(self):
     if self.debug:
         task_obj = {
             "id": 1,
             "url":
             "http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_1.html"
         }
         requestData = self.get_requests(task_obj)
         yield Request(requestData,
                       meta={"task": task_obj},
                       callback=self.parse)
     while True:
         task_obj = self.get_task(obj=True)
         if not task_obj:
             logger.debug("no tasks")
             break
         url = task_obj.url
         # download
         yield Request({"url": url},
                       meta={"task": task_obj},
                       callback=self.parse)
     return
Example 5
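A minimal start_requests that seeds the crawl with a single listing URL and passes self.proxies along in the Request payload.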
 def start_requests(self):
     """入口函数"""
     for i in range(0, 1):
         url = "https://laedc.org/research-analysis/search-reports/download-category/economic-impact-studies/"
         print(self.proxies)
         yield Request({"url": url, "proxies": self.proxies})
Example 6
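The simplest possible entry point: yield one Request built directly from a URL string.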
 def start_requests(self):
     """入口函数"""
     for i in range(1, 2):
         url = "http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_1.html"
         yield Request(url)
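
All six snippets assume the same framework contract: a Request that accepts either a URL string or a dict payload plus callback and meta keyword arguments, and a Response wrapper exposing the original request (response.request) and the native requests-module response (response.response). The skeleton below ties the pieces together as one spider. It is a minimal sketch, not a definitive implementation: Request comes from the unnamed framework used in the examples, and the MySpider name, the response.meta access, and the body of detail_content are illustrative assumptions.

import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup


class MySpider:
    # Request is provided by the crawling framework used in these examples;
    # its import is omitted because the framework is not named on this page.

    def start_requests(self):
        """Entry point: seed the crawl with the first listing page."""
        yield Request("http://www.ccgp-beijing.gov.cn/xxgg/sjzfcggg/index_1.html")

    def parse(self, response):
        # Unwrap the native requests-module Response, as in Examples 1-3
        _response = response.response
        soup = BeautifulSoup(_response.text, "html.parser")
        for item in soup.select("ul.xinxi_ul li"):
            data = {
                "url": urljoin(_response.url, item.select_one("a").attrs["href"]),
                "title": item.select_one("a").text,
            }
            # Queue each detail page, carrying the partial item in meta
            yield Request({"url": data["url"]},
                          callback=self.detail_content,
                          meta={"data": data})
        # Follow pagination back into parse, guarding against the last page
        next_link = soup.find("a", text=re.compile("下一页"))
        if next_link:
            yield Request({"url": urljoin(_response.url, next_link.attrs["href"])},
                          callback=self.parse)

    def detail_content(self, response):
        # Illustrative: assumes meta is exposed on the response (Scrapy-style)
        data = response.meta["data"]
        data["html"] = response.response.text
        yield data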