def parse(self,response):
    """Switch the response to GBK, print its URL, and return the URL in an Item.

    Args:
        response: Response object exposing a writable ``encoding``
            attribute and a ``url`` attribute.

    Returns:
        ``Item`` constructed from ``response.url``.
    """
    # Set the encoding to GBK.
    response.encoding = 'GBK'

    url = response.url
    print(url)
    return Item(url)
def parse_detail(self, response):
    """Add the runtime lookup to the data carried in ``response.meta``.

    Args:
        response: Response object whose ``meta['data']`` holds the mapping
            passed along from the previous parse function.

    Returns:
        ``Item`` wrapping that same mapping, now with a ``movie_length`` key.
    """
    # Retrieve the data handed over by the previous parse function.
    carried = response.meta['data']

    # NOTE(review): the whole xpath result is stored, not its first element;
    # confirm that this is intended.
    runtime = response.xpath('//span[@property="v:runtime"]/text()')
    carried['movie_length'] = runtime

    # Return the result.
    return Item(carried)
def parse(self, response):
    """Parse the Douban Movie Top 250 list page.

    Collects, for every ``<li>`` under the ``grid_view`` list, the first
    match of its title xpath, then yields one ``Item`` wrapping the list of
    all collected titles.

    Args:
        response: Response object exposing ``xpath()``; each result is
            expected to expose ``xpath()`` as well.

    Yields:
        A single ``Item`` built from the list of titles.
    """
    rows = response.xpath("//ol[@class='grid_view']/li")
    # For each <li>, take the first title match found beneath it.
    titles = [
        row.xpath(".//span[@class='title'][1]/text()")[0]
        for row in rows
    ]
    yield Item(titles)
def parse(self, response):
    """Yield an Item with title and link for the first three ``hd`` blocks.

    Args:
        response: Response object exposing ``xpath()``.

    Yields:
        One ``Item`` per node, wrapping a dict with ``page_title`` and
        ``page_link`` keys.
    """
    # Only the first three matching nodes are processed.
    for entry in response.xpath("//div[@class='hd']")[:3]:
        record = {
            'page_title': entry.xpath("./a/span/text()")[0],
            'page_link': entry.xpath("./a/@href")[0],
        }
        # Item data; per the original comment it is handed to the pipeline.
        yield Item(record)
def parse(self, response):
    """Yield an Item and a follow-up Request for the first three ``hd`` blocks.

    Args:
        response: Response object exposing ``xpath()``.

    Yields:
        For each node, first an ``Item`` wrapping a dict with ``page_title``
        and ``page_link`` keys, then a ``Request`` for that link whose
        callback is given by the name ``"parse_page"``.
    """
    # Only the first three matching nodes are processed.
    for entry in response.xpath("//div[@class='hd']")[:3]:
        record = {
            'page_title': entry.xpath("./a/span/text()")[0],
            'page_link': entry.xpath("./a/@href")[0],
        }
        # Item data; per the original comment it is handed to the pipeline.
        yield Item(record)

        # Request object; per the original comment the engine sends it and
        # the parse_page callback handles the response. The link is read
        # from the dict only after the previous yield resumes.
        yield Request(record['page_link'], callback="parse_page")
def parse_page(self, response):
    """Print the response's status code and URL, then yield an empty Item.

    Args:
        response: Response object exposing ``status_code`` and ``url``.

    Yields:
        A single ``Item`` wrapping an empty dict.
    """
    template = "[parse_page] : [{}] <{}>"
    print(template.format(response.status_code, response.url))
    yield Item({})
def parse(self, response):
    """Yield an Item holding the first ``<title>`` text match of the page.

    Args:
        response: Response object exposing ``xpath()``.

    Yields:
        A single ``Item`` wrapping a dict with a ``title`` key.
    """
    first_title = response.xpath("//head/title/text()")[0]
    yield Item({'title': first_title})
def parse_detail(self, response):
    """Store the runtime xpath result on the mapping carried in ``meta``.

    Args:
        response: Response object exposing ``meta`` (with a ``data`` entry)
            and ``xpath()``.

    Returns:
        ``Item`` wrapping the ``meta['data']`` mapping after a
        ``movie_length`` key has been set on it.
    """
    payload = response.meta['data']
    # NOTE(review): the xpath result is stored unindexed; confirm whether
    # the first match was meant instead.
    length_nodes = response.xpath('//span[@property="v:runtime"]/text()')
    payload['movie_length'] = length_nodes
    return Item(payload)
def parse(self, response):
    """Wrap the response URL in an Item and return it.

    Args:
        response: Response object exposing a ``url`` attribute.

    Returns:
        ``Item`` constructed from ``response.url``.
    """
    page_url = response.url
    return Item(page_url)