def parser(self, request, response):
    """Parse the list page and dispatch one crawl task per article link found."""
    article_urls = response.xpath('//div[@class="kjxw_tit"]/a/@href').extract()
    for article_url in article_urls:
        # Log each dispatched article task before yielding it.
        log.debug("下发文章任务 url = {}".format(article_url))
        yield spider.Request(article_url, callback=self.parser_artile)
def parser(self, request, response):
    """Parse the branch-list page and dispatch a detail request per link.

    Each anchor's text is forwarded as ``title`` so the detail parser can
    reuse it.
    """
    link_nodes = response.xpath('//ul[@class="branch_list_ul paging"]//a')
    for node in link_nodes:
        item_title = node.xpath("./text()").extract_first()
        item_url = node.xpath("./@href").extract_first()
        print("采集到列表 {} {}".format(item_title, item_url))
        yield spider.Request(item_url, title=item_title, callback=self.parser_detail)
def start_requests(self, *args, **kws):
    """Seed the crawl with the Beijing ChinaNews focus list page."""
    seed_url = "http://www.bj.chinanews.com/focus/1.html"
    yield spider.Request(seed_url)
def start_requests(self, *args, **kws):
    """Seed the crawl with a single request to the Baidu homepage."""
    seed_url = "https://www.baidu.com"
    yield spider.Request(seed_url)
def start_requests(self, task):
    """Build one request from a row fetched out of the task table.

    ``task`` is a ``(id, url)`` tuple; which fields are selected is
    declared in ``main()``. The row id is attached to the request as
    ``task_id`` so the framework can mark the task done later.
    """
    # Renamed the local from `id` to `task_id` — `id` shadows the builtin.
    # The keyword argument name (`task_id=`) is unchanged, so behavior
    # and the request interface are identical.
    task_id, url = task
    yield spider.Request(url, task_id=task_id)
def start_requests(self, *args, **kws):
    """Dispatch the initial list task: the Caijing columns index page."""
    seed_url = "http://column.caijing.com.cn/"
    yield spider.Request(seed_url)
def start_requests(self, *args, **kws):
    """Seed the crawl with the zh-CN Bing homepage."""
    seed_url = "https://cn.bing.com/?mkt=zh-CN"
    yield spider.Request(seed_url)