コード例 #1
0
 def parse_index(self, response):
     """Yield follow-up requests extracted from an index page.

     One WeixinRequest is yielded per matched article link, with
     ``self.parse_detail`` as its callback. If the page has a
     ``#sogou_next`` link, a final request for that page is yielded with
     this method as its callback and ``need_proxy=True``.

     :param response: object whose ``text`` attribute holds the page HTML
     """
     page = pq(response.text)
     for link in page('.news-box .news-list li .txt-box h3 a').items():
         yield WeixinRequest(url=link.attr['href'], callback=self.parse_detail)
     next_href = page('#sogou_next').attr('href')
     if not next_href:
         return
     # The next-page href is appended to the base URL as-is.
     yield WeixinRequest(
         url=self.base_url + str(next_href),
         callback=self.parse_index,
         need_proxy=True,
     )
コード例 #2
0
    def parse_index(self, response):
        """Yield requests for each article link on an index page, then the next page.

        Article links get ``self.parse_detail`` as callback. When a
        ``#sogou_next`` link is present, one more request is yielded for it,
        handled by this method and flagged ``need_proxy=True``.

        :param response: object whose ``text`` attribute holds the page HTML
        """
        document = pq(response.text)
        anchors = document(".news-list .txt-box h3 a").items()
        for anchor in anchors:
            detail_url = anchor.attr("href")
            detail_request = WeixinRequest(url=detail_url, callback=self.parse_detail)
            yield detail_request

        next_href = document("#sogou_next").attr("href")
        if next_href:
            # The href is concatenated onto the base URL unchanged.
            next_request = WeixinRequest(
                url=self.base_url + str(next_href),
                callback=self.parse_index,
                need_proxy=True,
            )
            yield next_request
コード例 #3
0
 def parse_index(self, response):
     """Parse an index page.

     Yields a WeixinRequest for every article link (callback
     ``self.parse_detail``) and, when a ``#sogou_next`` link exists, one
     request for the following index page (callback ``self.parse_index``,
     ``need_proxy=False``).

     :param response: object whose ``text`` attribute holds the page HTML
     """
     page = pq(response.text)
     article_links = page('.news-list > li > div.txt-box > h3 > a').items()
     for link in article_links:
         yield WeixinRequest(url=link.attr('href'), callback=self.parse_detail)
     next_href = page('#sogou_next').attr('href')
     if not next_href:
         return
     next_url = self.base_url + str(next_href)
     yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=False)
コード例 #4
0
 def start(self):
     """Apply the default headers to the session and enqueue the first request."""
     # Update the session headers globally.
     self.session.headers.update(self.headers)
     endpoint = self.base_url
     params = {'query': self.keyword, 'type': 2}
     first_request = WeixinRequest(
         url=endpoint + '?' + urlencode(params),
         callback=self.parse_index,
         need_proxy=True,
     )
     # Schedule the first request.
     self.queue.add(first_request)
コード例 #5
0
    def start(self):
        """Update the session headers and enqueue the initial request."""
        # Merge the default headers into the session.
        self.session.headers.update(self.headers)
        # NOTE(review): `the_dict` is not defined inside this method and no
        # "?" separator is inserted here — presumably both are handled
        # elsewhere (module-level dict, base_url ending in "?"); confirm.
        query_string = urlencode(the_dict)
        first_request = WeixinRequest(
            url=self.base_url + query_string,
            callback=self.parse_index,
            need_proxy=True,
        )
        self.queue.add(first_request)
コード例 #6
0
 def start(self):
     """Issue the first request.

     Updates the session headers, builds the start URL from the base URL
     and the keyword, and adds the resulting request to the queue.
     """
     self.session.headers.update(self.headers)
     endpoint = self.base_url
     params = {'type': '2', 'query': self.keyword}
     first_url = endpoint + '?' + urlencode(params)
     self.queue.add(WeixinRequest(first_url, self.parse_index, need_proxy=True))
コード例 #7
0
 def start(self):
     """Initialise the crawl: print the base URL and enqueue a request for it."""
     first_url = self.base_url
     print(first_url)
     # Schedule the first request (no proxy requested).
     self.queue.add(
         WeixinRequest(url=first_url, callback=self.parse_index, need_proxy=False)
     )
コード例 #8
0
 def parse_index(self, response):
     """Parse an index page.

     :param response: the response; its ``text`` attribute is parsed as HTML
     :return: generator of new WeixinRequest objects
     """
     page = pq(response.text)
     # Detail-page links found on the list page.
     for link in page('.news-box .news-list li .txt-box h3 a').items():
         yield WeixinRequest(url=link.attr('href'), callback=self.parse_detail)
     next_href = page('#sogou_next').attr('href')
     if not next_href:
         return
     # Build the next-page URL by appending the href to the base URL.
     next_url = self.base_url + str(next_href)
     # Request for the next list page.
     yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True)
コード例 #9
0
 def parse_index(self, response):
     """Parse an index page.

     Yields one WeixinRequest per article link, each with
     ``self.parse_detail`` as its callback.

     :param response: the response; its ``text`` attribute is parsed as HTML
     :return: generator of new requests
     """
     doc = pq(response.text)
     # `.txt-box` is a class selector. Without the leading dot the selector
     # would look for a <txt-box> element and match nothing.
     items = doc('.news-box .news-list li .txt-box h3 a').items()
     for item in items:
         url = item.attr('href')
         yield WeixinRequest(url=url, callback=self.parse_detail)
 def start(self):
     """Initialise the crawl by enqueuing the first request."""
     # Update the headers globally so that every request carries the cookies.
     self.session.headers.update(self.headers)
     endpoint = self.base_url
     params = {'query': self.keyword, 'type': 2}
     # Build a WeixinRequest from the start URL. parse_index is the callback
     # that handles the response, and need_proxy=True asks for a proxy.
     first_request = WeixinRequest(
         url=endpoint + '?' + urlencode(params),
         callback=self.parse_index,
         need_proxy=True,
     )
     # Schedule the first request by adding it to the queue.
     self.queue.add(first_request)
コード例 #11
0
    def start(self):
        """Initialise the crawl: update session headers and enqueue the first request."""
        # Update the headers globally.
        self.session.headers.update(self.headers)
        # The keyword is appended directly to the base URL.
        first_url = self.base_url + self.keyword
        # Schedule the first request.
        self.queue.add(
            WeixinRequest(url=first_url, callback=self.parse_index, need_proxy=True)
        )
コード例 #12
0
 def parse_index(self, response):
     """Parse an index page.

     :param response: the response; its ``text`` attribute is parsed as HTML
     :return: generator of new WeixinRequest objects
     """
     page = pq(response.text)
     for row in page('.row.hsd-article').items():
         # NOTE(review): the original comment here said detail-page requests
         # do not need a proxy IP, yet need_proxy=True is passed — confirm
         # which is intended.
         yield WeixinRequest(
             url=row('a').attr('href'),
             callback=self.parse_detail,
             need_proxy=True,
         )
     next_url = page('.pager_right a').attr('href')
     if not next_url:
         return
     # List-page requests need a proxy IP.
     yield WeixinRequest(url=next_url, callback=self.parse_index, need_proxy=True)
コード例 #13
0
 def start(self):
     """Update the session headers and enqueue the first request."""
     self.session.headers.update(self.headers)
     endpoint = self.base_url
     params = {"query": self.keyword, "type": 2}
     first_request = WeixinRequest(
         url=endpoint + "?" + urlencode(params),
         callback=self.parse_index,
         need_proxy=True,
     )
     self.queue.add(first_request)
コード例 #14
0
ファイル: db_redis.py プロジェクト: authetic-x/Web_Scraping
        self.db = StrictRedis(host=REDIS_HOST,
                              port=REDIS_PORT,
                              password=REDIS_PASSWORD)

    def add(self, request):
        """Append a request to the tail of the Redis list.

        :param request: object to enqueue; only WeixinRequest instances are accepted
        :return: the result of ``rpush``, or False when ``request`` is not a
            WeixinRequest
        """
        if not isinstance(request, WeixinRequest):
            return False
        serialized = dumps(request)
        return self.db.rpush(REDIS_KEY, serialized)

    def pop(self):
        """Remove and return the request at the head of the Redis list.

        :return: the deserialized request, or False when the list is empty
        """
        # Pop first and inspect the result instead of checking llen()
        # beforehand. Another consumer could drain the list between the two
        # calls, leaving lpop() to return None and loads() to raise.
        raw = self.db.lpop(REDIS_KEY)
        if raw is None:
            return False
        # NOTE(review): if `loads` is pickle.loads, the list must only ever
        # hold data written by trusted producers — confirm.
        return loads(raw)

    def clear(self):
        """Delete the queue's Redis key."""
        queue_key = REDIS_KEY
        self.db.delete(queue_key)

    def empty(self):
        """Return True when the Redis list length is zero."""
        remaining = self.db.llen(REDIS_KEY)
        return remaining == 0


if __name__ == '__main__':
    # Smoke test: push one request through the queue and print what comes back.
    queue = RedisQueue()
    sample = WeixinRequest(url='http://www.baidu.com', callback='hello', need_proxy=True)
    queue.add(sample)
    popped = queue.pop()
    print(popped)
    print(popped.callback, popped.need_proxy)