def parse_index(self, response):
    """Parse a search-results page.

    Yields one WeixinRequest per article link, then (when a pagination
    link exists) a proxied request for the next index page.
    """
    doc = pq(response.text)
    for anchor in doc('.news-box .news-list li .txt-box h3 a').items():
        yield WeixinRequest(url=anchor.attr['href'], callback=self.parse_detail)
    next_href = doc('#sogou_next').attr('href')
    if next_href:
        next_url = self.base_url + str(next_href)
        yield WeixinRequest(url=next_url, callback=self.parse_index,
                            need_proxy=True)
def parse_index(self, response):
    """Extract article links from the index page and follow pagination."""
    doc = pq(response.text)
    for link in doc(".news-list .txt-box h3 a").items():
        detail_request = WeixinRequest(url=link.attr("href"),
                                       callback=self.parse_detail)
        yield detail_request
    next_href = doc("#sogou_next").attr("href")
    if next_href:
        # next index page goes through a proxy
        yield WeixinRequest(url=self.base_url + str(next_href),
                            callback=self.parse_index, need_proxy=True)
def parse_index(self, response):
    """Parse the index page.

    :param response: index-page response
    :return: generator of WeixinRequest objects — one per article link,
        then the next index page (fetched without a proxy in this version)
    """
    doc = pq(response.text)
    anchors = doc('.news-list > li > div.txt-box > h3 > a').items()
    for anchor in anchors:
        yield WeixinRequest(url=anchor.attr('href'), callback=self.parse_detail)
    page_href = doc('#sogou_next').attr('href')
    if page_href:
        yield WeixinRequest(url=self.base_url + str(page_href),
                            callback=self.parse_index, need_proxy=False)
def start(self):
    """Schedule the first index-page request.

    Session-wide headers are applied first so every subsequent request
    carries them.
    """
    self.session.headers.update(self.headers)
    query = urlencode({'query': self.keyword, 'type': 2})
    first_request = WeixinRequest(url=self.base_url + '?' + query,
                                  callback=self.parse_index, need_proxy=True)
    # enqueue the seed request for the scheduler
    self.queue.add(first_request)
def start(self):
    """Kick off crawling: apply session headers and enqueue the first request.

    Bug fix: the original referenced an undefined name `the_dict` (NameError
    at call time) and omitted the '?' separator between the base URL and the
    query string. Rebuilt to match the sibling `start` implementations that
    search by `self.keyword`.
    """
    self.session.headers.update(self.headers)  # headers shared by the session
    start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
    weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                   need_proxy=True)
    self.queue.add(weixin_request)
def start(self):
    """Issue the first search request.

    :return: None — the request is pushed onto the queue, not executed here.
    """
    self.session.headers.update(self.headers)
    params = urlencode({'type': '2', 'query': self.keyword})
    first = WeixinRequest(self.base_url + '?' + params, self.parse_index,
                          need_proxy=True)
    self.queue.add(first)
def start(self): """ 初始化工作 """ start_url = self.base_url print(start_url) weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=False) # 调度第一个请求 self.queue.add(weixin_request)
def parse_index(self, response):
    """Parse an index page.

    :param response: HTTP response for the index page
    :return: generator of WeixinRequest objects — detail-page requests
        first, then a proxied request for the next index page if present
    """
    doc = pq(response.text)
    for anchor in doc('.news-box .news-list li .txt-box h3 a').items():
        # one detail-page request per article link on this results page
        yield WeixinRequest(url=anchor.attr('href'), callback=self.parse_detail)
    next_href = doc('#sogou_next').attr('href')
    if next_href:
        # pagination: index pages are fetched through a proxy
        yield WeixinRequest(url=self.base_url + str(next_href),
                            callback=self.parse_index, need_proxy=True)
def parse_index(self, response):
    """Parse the index page and yield one detail-page request per article.

    :param response: index-page response
    :return: generator of WeixinRequest objects

    Bug fix: the CSS selector used `li txt-box` — a descendant named
    `<txt-box>`, an element that does not exist — instead of the class
    selector `li .txt-box`, so the loop never matched any links (compare
    the sibling parse_index implementations).
    """
    doc = pq(response.text)
    items = doc('.news-box .news-list li .txt-box h3 a').items()
    for item in items:
        url = item.attr('href')
        weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
        yield weixin_request
def start(self): """ 初始化工作 :return: """ # 全局更新Headers,使得所有请求都能应用Cookies self.session.headers.update(self.headers) start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2}) weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)#改URL构造WeixinRequest # 回调函数是Spider类的的parse_index方法,当请求成功后,用parse_index来处理和解析,need_proxy参数设置为True,需要代理 # 调度第一个请求 self.queue.add(weixin_request) # 调用RedisQueue的add方法,将请求加入队列,等待调度
def start(self): """ 初始化工作 """ # 全局更新Headers self.session.headers.update(self.headers) start_url = self.base_url + self.keyword weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True) # 调度第一个请求 self.queue.add(weixin_request)
def parse_index(self, response): """ 解析索引页 :param response: 响应 :return: 新的响应 """ doc = pq(response.text) items = doc('.row.hsd-article').items() for item in items: url = item('a').attr('href') weixin_request = WeixinRequest(url=url, callback=self.parse_detail, need_proxy=True) #请求详情页不需要代理ip yield weixin_request next = doc('.pager_right a').attr('href') if next: url = next weixin_request = WeixinRequest( url=url, callback=self.parse_index, need_proxy=True #请求列表页需要代理ip ) yield weixin_request
def start(self):
    """Build the initial search URL and put the seed request on the queue."""
    self.session.headers.update(self.headers)
    query_string = urlencode({"query": self.keyword, "type": 2})
    initial = WeixinRequest(url=self.base_url + "?" + query_string,
                            callback=self.parse_index, need_proxy=True)
    self.queue.add(initial)
self.db = StrictRedis(host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD)  # tail of RedisQueue.__init__ — the def line precedes this chunk

def add(self, request):
    """Push a request onto the tail of the Redis list.

    :param request: object to enqueue; anything other than a WeixinRequest
        is rejected
    :return: new list length on success, False for non-WeixinRequest input
    """
    if isinstance(request, WeixinRequest):
        return self.db.rpush(REDIS_KEY, dumps(request))
    return False

def pop(self):
    """Pop and deserialize the head request; return False when empty."""
    # NOTE(review): dumps/loads look like pickle serialization — deserializing
    # is only safe because this queue is written by this same program; confirm
    # the Redis instance is not shared with untrusted writers.
    if self.db.llen(REDIS_KEY):
        return loads(self.db.lpop(REDIS_KEY))
    return False

def clear(self):
    """Delete the entire queue key from Redis."""
    self.db.delete(REDIS_KEY)

def empty(self):
    """Return True when the queue holds no requests."""
    return self.db.llen(REDIS_KEY) == 0

if __name__ == '__main__':
    # Smoke test: round-trip one request through the Redis-backed queue.
    db = RedisQueue()
    start_url = 'http://www.baidu.com'
    weixin_request = WeixinRequest(url=start_url, callback='hello', need_proxy=True)
    db.add(weixin_request)
    request = db.pop()
    print(request)
    print(request.callback, request.need_proxy)