def parse(self, response):
    """Parse a pearvideo list page.

    Yields one detail-page ``Request`` (handled by ``self.parse_item``) per
    ``<li>`` entry, then always re-enqueues the list page itself (via
    ``finally``) so it is polled again.
    """
    try:
        # Renamed from `bs4` — that name shadows the conventional module alias.
        soup = BeautifulSoup(response.text, 'html.parser')
        for li in soup.select('li'):
            # NOTE(review): select_one() returns None when a selector misses,
            # which raises TypeError/AttributeError below; the except block
            # treats any such <li> as a page-level failure — confirm intended.
            info = {}
            info['url'] = 'http://www.pearvideo.com/' + li.select_one("a")['href']
            info['thumbnail'] = li.select_one("img")['src']
            info['title'] = li.select_one("h2").text
            info['vdo-time'] = li.select_one("div[class=\"vdo-time\"]").text
            info['publish-time'] = li.select_one("div[class=\"publish-time\"]").text
            info['cont'] = li.select_one("div[class=\"cont\"]").text
            info['i-icon_col-name'] = li.select_one("a[class=\"i-icon col-name\"]").text
            info['i-icon_like-num'] = li.select_one("span[class=\"i-icon like-num\"]").text
            # Publish a new task for the detail page; carry the scraped
            # metadata along on the request object.
            request = Request(url=info['url'], callback=self.parse_item, priority=3)
            request.info = info
            yield request
    except Exception:
        # Was a bare `except:`. A bare except inside a generator also catches
        # GeneratorExit, and because the `finally` below yields, closing the
        # generator would raise "generator ignored GeneratorExit". Catching
        # Exception keeps SystemExit/KeyboardInterrupt/GeneratorExit intact.
        traceback.print_exc()
    finally:
        print("正在添加新任务至队列头部")
        # Re-enqueue the list page unconditionally so it keeps being polled.
        request = Request(url=response.url, dont_filter=True)
        yield request
        self.sleepMyself()
def start_requests(self):
    """Seed the crawl from ``self.start_urls``.

    URLs containing ``api.1sapp`` are treated as templates: ``{page}`` is
    filled with pages 1..10 and ``{timeStamp}`` with the current time in
    milliseconds, one request per page. Every other URL is yielded as-is.
    """
    for template in self.start_urls:
        if 'api.1sapp' not in template:
            yield Request(url=template)
            continue
        for page in range(10):
            # Fresh millisecond timestamp for each paged request.
            stamp = str(int(time.time() * 1000))
            filled = template.replace('{page}', str(page + 1)).replace('{timeStamp}', stamp)
            req = Request(url=filled)
            req.info = {'page': page}
            yield req
def parse(self, response):
    """Parse a JSON feed response.

    Expects a payload shaped like ``{"data": {"data": [ {...}, ... ]}}``
    (inferred from the access pattern below — confirm against the API).
    Yields one high-priority detail ``Request`` per feed item, then always
    re-enqueues the feed URL (via ``finally``) so it is polled again.
    """
    try:
        print(response.text)  # debug dump of the raw payload
        payload = json.loads(response.text)
        for data in payload['data']['data']:
            info = data
            # Tag each item with the feed it came from.
            info['fromSpider'] = '推荐流'
            request = Request(url=info['url'], priority=10, callback=self.parse_item)
            request.info = info
            yield request
    except Exception:
        # Was a bare `except:`. A bare except inside a generator also catches
        # GeneratorExit, and because the `finally` below yields, closing the
        # generator would raise "generator ignored GeneratorExit". Catching
        # Exception keeps SystemExit/KeyboardInterrupt/GeneratorExit intact.
        traceback.print_exc()
    finally:
        print("正在添加新任务至队列头部")
        # Re-enqueue the feed URL unconditionally so it keeps being polled.
        request = Request(url=response.url, dont_filter=True)
        yield request
        self.sleepMyself()
def parse(self, response):
    """Parse an HTML list page: yield one detail Request per ``<li>``, then
    always re-enqueue the page itself (via ``finally``) so it is polled again.
    """
    try:
        print(response.text)  # debug dump of the raw page
        soup = BeautifulSoup(response.text, 'html.parser')
        for li in soup.select('li'):
            info = {}
            # FIXME: the field-extraction logic for `li` is missing — `info`
            # is never populated, so `info['url']` below raises KeyError on
            # the first item and the whole loop aborts into the except block.
            # The sibling pearvideo parse() shows the intended shape
            # (url/thumbnail/title/...); fill this in before relying on it.
            # Publish a new task for the detail page.
            request = Request(url=info['url'], callback=self.parse_item, priority=3)
            request.info = info
            yield request
    except Exception:
        # Was a bare `except:`. A bare except inside a generator also catches
        # GeneratorExit, and because the `finally` below yields, closing the
        # generator would raise "generator ignored GeneratorExit". Catching
        # Exception keeps SystemExit/KeyboardInterrupt/GeneratorExit intact.
        traceback.print_exc()
    finally:
        print("正在添加新任务至队列头部")
        # Re-enqueue the list page unconditionally so it keeps being polled.
        request = Request(url=response.url, dont_filter=True)
        yield request
        self.sleepMyself()