def parse(self, response):
    """Handle the response of a start URL (the default callback).

    :param response: the response object
    :return: an item or a request; this implementation yields a single
        Item wrapping the raw response body
    """
    body = response.body
    yield Item(body)
def parse(self, response):
    """Default callback for the responses of the start URLs.

    :param response: response whose ``body`` is wrapped into an Item
    :return: generator yielding that single Item
    """
    item = Item(response.body)
    yield item
def parse(self, response):
    """Parse the response object.

    :param response: response whose ``url`` is wrapped into an Item
    :return: the Item (returned directly, not yielded)
    """
    url = response.url
    return Item(url)
def parse_detail(self, response):
    """Add the vote count to the item carried in ``response.meta``.

    ``item['stats-vote']`` receives the first text node matched by the
    ``stats-vote`` XPath, or None when nothing matches. The item is then
    yielded wrapped in an Item.
    """
    item = response.meta['item']
    votes = response.xpath("//span[@class='stats-vote']/i/text()")
    item['stats-vote'] = votes[0] if votes else None
    yield Item(item)
def parse_detail(self, response):
    """Parse a detail page: record its URL on the item from ``response.meta``."""
    detail = response.meta["item"]
    detail["url"] = response.url
    # The original author notes that the callback must hand back a container
    # (an empty list was used while debugging); yielding an Item works too.
    yield Item(detail)
def parse_detail(self, response):
    """Parse a detail page and add the movie runtime to the carried data.

    The dict stored under ``response.meta['data']`` gets a
    ``'movie_length'`` entry: the first text node of the ``v:runtime``
    span, or None when the page has no such node (instead of raising
    IndexError). The dict is printed and then yielded wrapped in an Item.
    """
    data = response.meta['data']
    runtimes = response.xpath('//span[@property="v:runtime"]/text()')
    # Guard the lookup: a page without a runtime span yields an empty result.
    data['movie_length'] = runtimes[0] if runtimes else None
    print(data)
    yield Item(data)
def parse(self, response):
    """Parse the Douban movie Top 250 list page.

    Collects, for every ``<li>`` under the ``grid_view`` list, the first
    text node matched by the title XPath, and yields one Item holding the
    whole list of titles.
    """
    titles = [
        li.xpath(".//span[@class='title'][1]/text()")[0]
        for li in response.xpath("//ol[@class='grid_view']/li")
    ]
    yield Item(titles)
def parse(self, response):
    """Yield one detail-page Request per list entry.

    Each Request carries an Item holding the entry's name in ``meta`` and
    uses ``self.parse_detail`` as its callback.
    """
    for entry in response.xpath('//*[@id="content"]/div/div[1]/ol/li'):
        name = entry.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
        item = Item({'name': name})
        detail_url = entry.xpath('./div/div[2]/div[1]/a/@href')[0]
        yield Request(detail_url, callback=self.parse_detail, meta={'item': item})
def parse(self, response):
    """Process the list-page response.

    For every ``<li>`` holding movie information, wrap the movie name in an
    Item and yield a Request for the movie's detail page with that Item
    passed along in ``meta``.
    """
    rows = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for row in rows:
        info = {}
        info['name'] = row.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
        item = Item(info)
        # Build the detail-page request from the entry's link.
        yield Request(
            row.xpath('./div/div[2]/div[1]/a/@href')[0],
            callback=self.parse_detail,
            meta={'item': item},
        )
def parse(self, response):
    """Parse the response object.

    :param response: response searched for ``<a>`` elements inside
        ``div.hd`` blocks
    :return: generator yielding one Item per link, holding its
        ``movie_name`` and ``movie_url``
    """
    for link in response.xpath('//div[@class="hd"]/a'):
        yield Item({
            'movie_name': link.xpath('./span[1]/text()')[0],
            'movie_url': link.xpath('./@href')[0],
        })
def parse(self, response):
    """Extract data from the page: group by entry first, then pull the fields.

    For each processed entry an Item is yielded, followed by a Request for
    the entry's detail page with the same dict passed along in ``meta``.
    """
    entries = response.xpath("//div[@id='content-left']/div")
    # Only the first entry is processed (slice [:1]).
    for entry in entries[:1]:
        name = entry.xpath(".//h2/text()")[0].strip()

        age_nodes = entry.xpath(
            ".//div[contains(@class,'articleGender')]/text()")
        age = age_nodes[0] if len(age_nodes) > 0 else None

        gender_classes = entry.xpath(
            ".//div[contains(@class,'articleGender')]/@class")
        if len(gender_classes) > 0:
            # Last space-separated class token with "Icon" removed.
            gender = gender_classes[0].split(" ")[-1].replace("Icon", "")
        else:
            gender = None

        # Resolve the (possibly relative) link against the page URL.
        href = urllib.parse.urljoin(response.url, entry.xpath("./a/@href")[0])

        item = {"name": name, "age": age, "gender": gender, "href": href}
        yield Item(item)
        yield Request(item["href"], parse="parse_detail", meta={"item": item})
def parse(self, response):
    """Parse a response whose body is JavaScript code.

    The body is evaluated with js2py and the ``list`` attribute of the
    result is yielded wrapped in an Item.
    """
    # Per the original author's analysis the site serves GBK-encoded data,
    # so the body is decoded before evaluation.
    script = response.body.decode("gbk")
    # NOTE(review): this executes JavaScript received from the remote site;
    # confirm the source is trusted.
    result = js2py.eval_js(script)
    yield Item(result.list)
def parse(self, response):
    """Wrap the raw response body in an Item and return it."""
    body = response.body
    return Item(body)
def parse(self, response):
    """Yield an Item holding the first ``//title/text()`` match of the page."""
    title = response.xpath("//title/text()")[0]
    yield Item(title)
def parse(self, response):
    """Parse the response and return a new request object or a data object.

    This implementation returns an Item built from the response body.
    """
    item = Item(response.body)
    return item
def parse_detail(self, response):
    """Add the vote count to the item carried in ``response.meta``.

    ``item['stats-vote']`` is set to the first text node matched by the
    ``stats-vote`` XPath, or None when the page has no such node (instead
    of raising IndexError). The item is then yielded wrapped in an Item.
    """
    item = response.meta['item']
    votes = response.xpath("//span[@class='stats-vote']/i/text()")
    # Guard the lookup: pages without a vote counter yield an empty result.
    item['stats-vote'] = votes[0] if votes else None
    yield Item(item)
def parse_page(self, response):
    """Handle the response of a single movie detail page.

    Yields an Item wrapping the response URL.
    """
    page_url = response.url
    yield Item(page_url)
def parse(self, response):
    """Yield one Item holding every text node matched by the title XPath."""
    titles = response.xpath("//span[@class='title'][1]/text()")
    yield Item(titles)
def parse(self, response):
    """Yield one Item per ``div.hd`` block, holding its first title text."""
    blocks = response.xpath("//div[@class='hd']")
    for block in blocks:
        yield Item(block.xpath(".//span[@class='title'][1]/text()")[0])
def parse(self, response):
    """Yield a single Item wrapping the response body."""
    payload = response.body
    yield Item(payload)
def parse(self, response):
    """Parse the response; return a new request object or a data object.

    This implementation returns an Item built from the response body.
    """
    data = response.body
    return Item(data)
def parse(self, response):
    """Parse the response data and return data or a new request.

    This implementation returns an Item built from the response body.
    """
    return Item(response.body)
def parse(self, response):
    """Parse the response by yielding its body wrapped in an Item."""
    result = Item(response.body)
    yield result
def parse(self, response):
    """Yield an Item built from the first ``//title/text()`` match."""
    yield Item(response.xpath("//title/text()")[0])
def parse_detail(self, response):
    """Callback for detail-page responses.

    Sets ``item["stats_vote"]`` on the item carried in ``response.meta``
    to the first text node matched by the ``stats-vote`` XPath, or None
    when the page has no such node (instead of raising IndexError), then
    yields the item wrapped in an Item.
    """
    item = response.meta["item"]
    votes = response.xpath("//span[@class='stats-vote']/i/text()")
    # Guard the lookup: pages without a vote counter yield an empty result.
    item["stats_vote"] = votes[0] if votes else None
    yield Item(item)
def parse(self, response):
    """Yield a single Item wrapping the response URL."""
    url = response.url
    yield Item(url)
def parse_detail(self, response):
    """Store the starring info on the item carried in ``response.meta``.

    ``item['movie_starring']`` receives the full XPath result (every text
    node of the 10th ``<span>`` under ``#info``), not just the first match.
    The item is then yielded wrapped in an Item.
    """
    meta = response.meta
    item = meta.get('item')
    starring = response.xpath('//*[@id="info"]/span[10]/text()')
    item['movie_starring'] = starring
    yield Item(item)