def parse(self, response):
    # Parse the responses for the start_urls
    # with open('error.html', 'w') as f:
    #     f.write(response.body.decode())
    print(response.url + '*****')
    tr_list = response.xpath('//*[@class="tablelist"]//tr')[1:-1]
    print(len(tr_list))
    for tr in tr_list:
        item = {}
        # Extract part of the data from the list page
        item['name'] = tr.xpath('./td[1]/a/text()')[0]
        item['address'] = tr.xpath('./td[4]/text()')[0]
        item['time'] = tr.xpath('./td[5]/text()')[0]
        # Build the detail page url and send the request
        detail_url = 'https://hr.tencent.com/' + tr.xpath('./td[1]/a/@href')[0]
        print(detail_url)
        yield Request(
            detail_url,
            parse='parse_detail',
            meta=item  # meta takes a dict
        )
    # Pagination
    next_href = response.xpath('//a[text()="下一页"]/@href')[0]
    print(next_href)
    if next_href != 'javascript:;':
        next_url = 'https://hr.tencent.com/' + next_href
        yield Request(next_url, parse='parse')
def start_request(self):
    # Send the requests for the url addresses in start_urls
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36"
    }
    url_temp = "https://movie.douban.com/top250?start={}&filter="
    for i in [page * 25 for page in range(10)]:
        yield Request(url_temp.format(i), headers=headers)
def start_requests(self):
    # Override start_requests to yield multiple requests
    # base_url = 'http://movie.douban.com/top250?start='
    url = 'http://www.heimahui.club:8081/?a='
    for i in range(100):
        # Yield one request per value of i, each with a distinct url
        yield Request(url + str(i))
def start_request(self): """ 构造请求对象 :return: """ for url in self.start_urls: yield Request(url)
def start_requests(self):
    start_urls = [
        "https://movie.douban.com/top250?start=" + str(page)
        for page in range(0, 226, 25)
    ]
    for start_url in start_urls:
        yield Request(start_url, headers=self.headers)
def start_requests(self):
    # request_list = []
    # for start_url in self.start_urls:
    #     request_list.append(Request(start_url))
    # return request_list
    # Written as a generator so the Engine can iterate it and pull out each
    # request one at a time (see the sketch after this method)
    for start_url in self.start_urls:
        yield Request(start_url)
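# A minimal sketch (an assumption, not the framework's actual code) of why
# start_requests above is written as a generator: the engine can iterate it
# and pull requests out lazily, one at a time, instead of waiting for a full
# list to be built. The add_start_requests name and the scheduler interface
# are made up for illustration.
def add_start_requests(spider, scheduler):
    for request in spider.start_requests():  # lazy: one Request per iteration
        scheduler.add_request(request)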
def parse(self, response):
    title_list = []
    for li in response.xpath("//ol[@class='grid_view']/li"):
        # title = li.xpath(".//span[@class='title'][1]/text()")  # extract the title under this li tag
        # title_list.append(title[0])
        detail_url = li.xpath(
            ".//div[@class='info']/div[@class='hd']/a/@href")[0]
        yield Request(url=detail_url, parse='parse_detail')
def start_requests(self):
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php'
    while True:
        yield Request(url, dont_filter=True)
        # The wait must come after the yield, otherwise the program would hang here
        time.sleep(10)
def parse(self, response):
    self.total += 1
    time.sleep(2)
    if self.total > 10:
        return
    yield Request(
        'https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page=1',
        filter=False,
        parse='parse')
def parse(self, response):
    print(response.body)
    # The response body is JavaScript code
    # Use the js2py module to execute the JS and extract the data;
    # analysis shows the site encodes its data as gbk, so decode it first
    ret = js2py.eval_js(response.body.decode("gbk"))
    for news in ret.list:
        yield Request(news["url"], headers=self.headers,
                      parse='parse_detail',
                      meta={"type": news["channel"]["title"]})
def parse(self, response):
    divs = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
    for div in divs:
        dic = {}
        # dic['url'] = response.url
        dic['name'] = div.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
        item = Item(dic)
        detail_url = div.xpath('./div/div[2]/div[1]/a/@href')[0]
        # yield item
        yield Request(detail_url, callback=self.parse_detail,
                      meta={'item': item})
def parse(self, response):
    '''Parse a douban movie top250 list page'''
    # yield {'i': '同意!'}
    for li in response.xpath("//ol[@class='grid_view']/li"):  # iterate over each li tag
        item = {}
        item["title"] = li.xpath(".//span[@class='title'][1]/text()")[0]  # extract the title under this li tag
        # print(item)
        # yield item
        detail_url = li.xpath(".//div[@class='info']/div[@class='hd']/a/@href")[0]
        # Request the detail page and designate parse_detail as its parse method
        yield Request(detail_url, parse="parse_detail", meta={"item": item})
def parse(self, response):
    a_s = response.xpath(
        '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a')
    for a in a_s:
        data = {}
        data['movie_name'] = a.xpath('./span[1]/text()')[0]
        data['movie_url'] = a.xpath('./@href')[0]
        # print(data)
        # yield Item(data)
        yield Request(data['movie_url'], callback=self.parse_detail,
                      meta={'data': data})
def parse(self, response):
    # Extract the data from the page
    li_list = response.xpath('//*[@id="content"]/div/div[1]/ol/li')
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
        "Host": "movie.douban.com"
    }
    for li in li_list[0:1]:
        item = {}
        item['movie_name'] = li.xpath('.//div/div[2]/div[1]/a/span[1]/text()')[0]
        item['movie_actor'] = li.xpath('.//div/div[2]/div[2]/p[1]/text()')[0]
        item['detail_url'] = li.xpath('.//div/div[2]/div[1]/a/@href')[0]
        print(item['detail_url'])
        yield Request(item['detail_url'], parse='parse_detail',
                      meta={'item': item}, headers=headers)
def parse(self, response):
    # item = {}
    # item['title'] = response.xpath("//head/title/text()")[0]
    # yield Item(item)
    node_list = response.xpath("//div[@class='hd']")[:3]
    for node in node_list:
        item = {}
        item['page_title'] = node.xpath("./a/span/text()")[0]
        item['page_link'] = node.xpath("./a/@href")[0]
        # Item data is handed to the pipeline
        yield Item(item)
        # The Request object is sent by the Engine and its response parsed
        # by the designated callback parse_page
        yield Request(item['page_link'], callback="parse_page")
def parse(self, response):
    # Extract the list of a tags
    a_s = response.xpath(
        '//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a')
    for a in a_s:
        data = {}
        data['movie_name'] = a.xpath('./span[1]/text()')[0]
        data['movie_url'] = a.xpath('./@href')[0]
        # yield Item(data)
        # 2.1.2-1: build the detail page request and hand it to the engine
        yield Request(data['movie_url'], callback=self.parse_detail,
                      meta={'data': data}, headers=self.headers)
def parse(self, response): node_list = response.xpath("//div[@class='hd']") for node in node_list: data = {} # 电影标题 data['title'] = node.xpath("./a/span[1]/text()")[0] # 详情页链接 data['url'] = node.xpath("./a/@href")[0] # 返回Item对象和Request对象给引擎 # 如果是Item对象交给管道处理 # 如果是Request对象,通过getattr获取解析方法处理对应的响应 yield Item(data) yield Request(data['url'], callback="parse_page")
def start_requests(self):
    # Prepare the URL
    url = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php'
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'Referer': 'http://roll.news.sina.com.cn/s/channel.php?ch=01'
    }
    while True:
        # Build the request.
        # By default any duplicate request is filtered out; a request that
        # sets dont_filter=True is not filtered. To support this:
        # 1. change the framework's Request to accept dont_filter
        # 2. change the framework's scheduler so that, when a request is added,
        #    it checks whether the request needs to be filtered
        # (a sketch of both changes follows this method)
        yield Request(url, headers=headers, dont_filter=True)
        # Take a break: the sleep must come after the yield, once the coroutine
        # has switched; otherwise the program would stall at this point
        time.sleep(2)
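# A minimal sketch (an assumption about the framework internals, not its
# actual code) of the two changes listed above: Request accepts dont_filter,
# and the scheduler skips the fingerprint-based de-duplication check for such
# requests. Class names and the fingerprint scheme are made up for illustration.
import hashlib


class RequestSketch:
    def __init__(self, url, headers=None, dont_filter=False):
        self.url = url
        self.headers = headers
        self.dont_filter = dont_filter


class SchedulerSketch:
    def __init__(self):
        self.queue = []
        self._fingerprints = set()

    def _fingerprint(self, request):
        return hashlib.sha1(request.url.encode('utf-8')).hexdigest()

    def add_request(self, request):
        fp = self._fingerprint(request)
        # Drop the request only when it allows filtering and is a duplicate
        if not request.dont_filter and fp in self._fingerprints:
            return
        self._fingerprints.add(fp)
        self.queue.append(request)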
def parse(self, response):
    '''Parse a douban movie top250 list page'''
    # title_list = []  # store all of them
    for li in response.xpath("//ol[@class='grid_view']/li"):  # iterate over each li tag
        item = {}
        item["title"] = li.xpath(".//span[@class='title'][1]/text()")[0]  # extract the title under this li tag
        # title_list.append(title[0])
        detail_url = li.xpath(
            ".//div[@class='info']/div[@class='hd']/a/@href")[0]
        self.page += 1
        if self.page > 2:
            break
        # Request the detail page and designate parse_detail as its parse method
        yield Request(detail_url, parse="parse_detail", meta={"item": item})
def parse(self, response):
    # Extract the page data: group first, then extract
    div_list = response.xpath("//div[@id='content-left']/div")
    for div in div_list[:1]:
        item = {}
        item['name'] = div.xpath(".//h2/text()")[0].strip()
        item['age'] = div.xpath(
            ".//div[contains(@class,'articleGender')]/text()")
        item['age'] = item['age'][0] if len(item['age']) > 0 else None
        item['gender'] = div.xpath(
            ".//div[contains(@class,'articleGender')]/@class")
        item['gender'] = item['gender'][0].split(' ')[-1].replace(
            'Icon', '') if len(item['gender']) > 0 else None
        item['href'] = urllib.parse.urljoin(response.url,
                                            div.xpath("./a/@href")[0])
        # yield Item(item)
        yield Request(item['href'], parse='parse_detail', meta={'item': item})
def parse(self, response): """提取页面的数据""" # 先分组,在提取数据 div_list = response.xpath("//div[@id='content-left']/div") for div in div_list[:1]: item = {} item["name"] = div.xpath(".//h2/text()")[0].strip() item["age"] = div.xpath( ".//div[contains(@class,'articleGender')]/text()") item["age"] = item["age"][0] if len(item["age"]) > 0 else None item["gender"] = div.xpath( ".//div[contains(@class,'articleGender')]/@class") # item["gender"] = item["gender"][0].split(' ')[-1].replace("Icon", "") if len(["gender"]) > 0 else None item["gender"] = item["gender"][0].split(" ")[-1].replace( "Icon", "") if len(item["gender"]) > 0 else None item["href"] = urllib.parse.urljoin(response.url, div.xpath("./a/@href")[0]) # print(item) yield Item(item) yield Request(item["href"], parse="parse_detail", meta={"item": item})
def start_requests(self):
    url_temp = 'https://www.qiushibaike.com/hot/page/{}/'
    for i in range(1, 14):
        yield Request(url_temp.format(i))
def start_requests(self):
    for url in self.start_urls:
        # No callback specified, so the response is parsed by parse by default
        # 2. dont_filter: this request is exempt from de-duplication
        yield Request(url, dont_filter=True)
def start_requests(self):
    # Override start_requests to return multiple requests
    base_url = 'http://movie.douban.com/top250?start='
    for i in range(0, 250, 25):
        # Yield the request objects for pages 1-10 one by one
        url = base_url + str(i)
        yield Request(url)
def start_requests(self): """构建初始请求对象""" for url in self.start_urls: yield Request(url)
def start_requests(self):
    while True:
        # This request is needed to get the list page data; the response is a JS statement
        url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&ch=&k=&offset_page=0&offset_num=0&num=120&asc=&page=1&r=0.5559616678192825"
        yield Request(url, parse='parse', filter=False)
        time.sleep(10)  # send a request every 10 seconds
def start_requests(self):
    for start_url in self.start_urls:
        yield Request(start_url, filter=False)
def start_request(self):
    return Request(self.start_url)
def start_requests(self):
    for url in self.start_urls:
        yield Request(url)
def start_requests(self):
    for url in self.start_urls:
        # No callback specified, so the response is parsed by parse by default
        yield Request(url)