Code example #1
    def parse(self, response):
        # Assumes module-level: from scrapy import Request; from bs4 import BeautifulSoup; import traceback
        try:
            bs4 = BeautifulSoup(response.text, 'html.parser')
            for li in bs4.select('li'):
                info = {}
                info['url'] = 'http://www.pearvideo.com/' + li.select_one("a")['href']
                info['thumbnail'] = li.select_one("img")['src']
                info['title'] = li.select_one("h2").text
                info['vdo-time'] = li.select_one('div[class="vdo-time"]').text
                info['publish-time'] = li.select_one('div[class="publish-time"]').text
                info['cont'] = li.select_one('div[class="cont"]').text
                info['i-icon_col-name'] = li.select_one('a[class="i-icon col-name"]').text
                info['i-icon_like-num'] = li.select_one('span[class="i-icon like-num"]').text

                # Dispatch a new request for the detail page, carrying the listing data along
                request = Request(url=info['url'], callback=self.parse_item, priority=3)
                request.info = info
                yield request
        except Exception:
            traceback.print_exc()
        finally:
            print("Adding a new task to the head of the queue")
            request = Request(url=response.url, dont_filter=True)
            yield request
            self.sleepMyself()
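
Each listing field is attached to the outgoing Request as a custom `info` attribute and picked up again in the detail callback. The original `parse_item` is not shown in these snippets; a minimal sketch of what it might look like under that assumption (the extra field added here is hypothetical):

    def parse_item(self, response):
        # Hypothetical sketch only: recover the dict attached in parse() and
        # yield it as the scraped item. The real parse_item is not shown above.
        info = getattr(response.request, 'info', {})
        info['detail-url'] = response.url  # hypothetical extra field from the detail page
        yield info

The more conventional way to carry data across callbacks is Request(..., meta={'info': info}) read back as response.meta['info'] (or cb_kwargs on Scrapy 1.7+); an ad-hoc attribute like request.info only survives as long as the request object is never copied or serialized.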
Code example #2
    def start_requests(self):
        # Assumes module-level: from scrapy import Request; import time
        for url in self.start_urls:
            if 'api.1sapp' in url:
                # The API entry is a URL template: fetch the first 10 pages, filling in
                # the page number and a millisecond timestamp.
                for i in range(10):
                    urlNow = url.replace('{page}', str(i + 1)).replace(
                        '{timeStamp}', str(int(time.time() * 1000)))
                    request = Request(url=urlNow)
                    request.info = {'page': i}
                    yield request
                continue
            yield Request(url=url)
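
The `{page}` and `{timeStamp}` tokens are plain placeholders inside the start_urls strings, replaced at request time with the page number and a millisecond timestamp. The actual API URLs are not part of the snippet; purely as an illustration of the convention, an entry might be shaped like this (example.com stands in for the real host):

    # Hypothetical illustration only; the real start_urls entries are not shown above.
    start_urls = [
        'https://example.com/recommend/feed?page={page}&ts={timeStamp}',
    ]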
Code example #3
    def parse(self, response):
        # Assumes module-level: from scrapy import Request; import json; import traceback
        try:
            print(response.text)  # debug output of the raw JSON body
            msgDict = json.loads(response.text)
            for data in msgDict['data']['data']:
                info = data
                info['fromSpider'] = '推荐流'  # tag the record with its source feed ("recommendation stream")
                request = Request(url=info['url'],
                                  priority=10,
                                  callback=self.parse_item)
                request.info = info
                yield request
        except Exception:
            traceback.print_exc()
        finally:
            print("Adding a new task to the head of the queue")
            request = Request(url=response.url, dont_filter=True)
            yield request
            self.sleepMyself()
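
On Scrapy 2.2 or newer the JSON body can also be decoded directly on the response, which makes the manual json.loads call unnecessary (a minor variant, not part of the original code):

    msgDict = response.json()  # equivalent to json.loads(response.text) for a JSON response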
Code example #4
    def parse(self, response):
        # Assumes module-level: from scrapy import Request; from bs4 import BeautifulSoup; import traceback
        try:
            print(response.text)  # debug output of the raw HTML
            bs4 = BeautifulSoup(response.text, 'html.parser')

            for li in bs4.select('li'):
                info = {}
                # NOTE: the field extraction from `li` is omitted in the original snippet;
                # info must at least contain 'url' before the Request below is built,
                # otherwise info['url'] raises a KeyError.
                # Dispatch a new request for the detail page
                request = Request(url=info['url'],
                                  callback=self.parse_item,
                                  priority=3)
                request.info = info
                yield request
        except Exception:
            traceback.print_exc()
        finally:
            print("Adding a new task to the head of the queue")
            request = Request(url=response.url, dont_filter=True)
            yield request
            self.sleepMyself()
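
Every parse() variant ends by re-queueing the listing/API URL with dont_filter=True and then calling self.sleepMyself(), which is not defined in any of the snippets. A minimal sketch, assuming it is simply a fixed pause between polling rounds; note that a blocking time.sleep() stalls Scrapy's Twisted reactor, so DOWNLOAD_DELAY or AutoThrottle is usually the better way to pace requests:

    def sleepMyself(self):
        # Hypothetical helper, not shown in the original code: pause before the
        # re-queued request is fetched again. A blocking sleep freezes the whole
        # reactor, so treat this as a placeholder for proper throttling settings.
        time.sleep(10)  # assumes `import time` at module level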