def crawl(self):
    cid = self.key
    channel = CHANNELS[int(cid)]
    page = 1
    pagesize = 30
    while True:
        try:
            data = api_shows(cid, page, pagesize)
            if data is not None:
                page += 1
            else:
                return
        except:
            self.logger.warning(get_exception_info())
            continue
        if not data.get('results'):
            break
        for item in data['results']:
            try:
                show_id = item['tid']
                reset = (item['completed'] == 0)
                crawler_data = {
                    'channel': channel,
                    'image': item.get('show_vthumburl_hd') or item.get('show_thumburl_hd'),
                    'image2': item.get('show_thumburl_hd'),
                }
                Scheduler.schedule(AlbumCrawler.type, key=show_id, data=crawler_data, reset=reset)
            except:
                self.logger.warning(get_exception_info())
def crawl(self):
    catecode = self.data["catecode"]
    last_updated = self.data.get("updated", datetime.min)
    current_updated = datetime.max
    max_time = last_updated
    page = 1
    pagesize = 20
    while True:
        try:
            data = api_albums(catecode, page, pagesize)
            for item in data["videos"]:
                try:
                    sid = item.get('sid')
                    detail = api_album(sid) if sid else None
                    model = self.extract_model(item, detail)
                    if sid:
                        videos = self.get_videos(sid)
                        if model['channel'] in [u'综艺']:  # reverse order for zongyi
                            videos = list(reversed(videos))
                    else:
                        video = VideoItemModel({
                            "title": model["title"],
                            "image": model["image"],
                            "description": model["description"],
                            "time": model["time"],
                            "price": model["price"],
                            "duration": model["duration"],
                            "url": model["url"],
                        })
                        videos = [video]
                    model['videos'] = videos
                    export(model)
                    current_updated = model["time"]
                    if max_time < current_updated:
                        max_time = current_updated
                except:
                    self.logger.warning(get_exception_info())
            if current_updated < last_updated:
                break
            if page * pagesize >= data["count"]:
                break
        except:
            self.logger.warning(get_exception_info())
        page += 1
    self.data["updated"] = max_time
def crawl(self):
    for spec in TOP_SPEC:
        video_set = set()
        videos = []
        count = TOP_COUNT / len(spec['sources'])
        for src, param in spec['sources']:
            func = getattr(self, "crawl_%s" % src, None)
            if func:
                try:
                    titles = func(param, count)
                    for title in titles:
                        if title not in video_set:
                            video_set.add(title)
                            videos.append(title)
                except:
                    self.logger.warning(get_exception_info())
        rank = VideoRankModel({
            "source": self.data['source'],
            "title": spec["title"],
            "type": spec["type"],
            "videos": videos,
        })
        export(rank)
def _service_worker(category):
    global _RUNNING_CRAWLER
    signal.signal(signal.SIGTERM, _worker_term_handler)
    start_time = time.time()
    while time.time() <= start_time + CrawlerConf._CRAWLER_PROCESS_MAX_TIME:
        item = Scheduler.fetch(category)
        _RUNNING_CRAWLER = item
        timer = Timer(item.get('timeout', CrawlerConf._CRAWLER_TIMEOUT_DEFAULT),
                      _worker_timeout_handler)
        timer.start()
        c = Crawler.create(item["type"], item["key"], item["data"])
        if c:
            try:
                c.crawl()
                success = True
                logger.info("CRAWL SUCCEED %s" % c)
            except Exception:
                msg = get_exception_info()
                success = False
                logger.error("CRAWL FAILED %s, %s" % (c, msg))
        else:
            logger.warn("CRAWL CREATE FAILED %s" % item)
            success = False
        timer.cancel()
        _RUNNING_CRAWLER = None
        Scheduler.finish(item['type'], item['key'], c.data if c else {}, success)
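# _worker_term_handler and _worker_timeout_handler are registered above but their bodies are
# not part of this listing. A minimal sketch, assuming the timeout handler simply terminates
# the worker process so the scheduler can requeue the running item; the exact behavior here
# is an assumption, not the original implementation:
import os
import signal
import sys

def _worker_term_handler(signum, frame):
    # hypothetical: exit the worker cleanly on SIGTERM
    sys.exit(0)

def _worker_timeout_handler():
    # hypothetical: fired by the Timer when a crawl overruns its timeout;
    # send SIGTERM to ourselves so _worker_term_handler shuts the process down
    os.kill(os.getpid(), signal.SIGTERM)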
def get_albums(self, cid):
    cid = "%s,0~0~0" % cid
    params = dict(PARAMS_PROTOTYPE)
    params.update(PARAMS_LIST)
    params['category_id'] = cid
    while True:
        try:
            ret = call_api("getViewObject", params)
            if not ret.albumIdList:
                break
            # list[0] is ordered by time, list[1] is recommendation
            ids = ret.albumIdList[0]['idlist']
            if not ids:
                break
            for album_id in ids:
                album = ret.albumArray.get(int(album_id))
                if album:
                    yield album
        except GeneratorExit:
            return
        except:
            self.logger.warning(get_exception_info())
            continue
        params['pn'] += 1
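# A brief usage sketch for the generator above (the helper name and album count are made up
# for illustration). Because get_albums() handles GeneratorExit explicitly, a caller may stop
# iterating early and the generator winds down without logging a spurious warning when it is
# closed:
def first_n_albums(list_crawler, cid, n=10):
    albums = []
    for album in list_crawler.get_albums(cid):
        albums.append(album)
        if len(albums) >= n:
            break  # closing the generator raises GeneratorExit at the yield point
    return albums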
def crawl(self):
    channel = self.key
    for year in range(self.data["year"], datetime.utcnow().year + 1):
        for site in SITES:
            try:
                items = api_history(year, channel, site)
            except:
                self.logger.warning(get_exception_info())
                continue
            for item in items:
                try:
                    self.process_album(item)
                except:
                    self.logger.warning(get_exception_info())
    self.data["year"] = datetime.utcnow().year
def handle(self, *args, **options):
    if len(args) < 1:
        self.hint()
        return
    command = args[0]
    _args = []
    for i in range(1, len(args)):
        arg = args[i]
        if arg.isdigit():
            arg = int(arg)
        _args.append(arg)
    func = getattr(scripts, command, None)
    if not callable(func):
        self.hint()
    else:
        try:
            func(*_args)
        except SystemExit:
            print 'process exit.'
        except:
            print get_exception_info()
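# A hedged usage sketch: assuming this handle() belongs to a Django management command
# (the command name "runscript" and the script name below are invented examples), digit-only
# arguments are coerced to int before being forwarded to a function in the scripts module:
#
#     $ python manage.py runscript rebuild_rank 7
#     # -> calls scripts.rebuild_rank(7)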
def crawl_video(self, show_id):
    videos = []
    page = 1
    pagesize = 30
    while True:
        data = api_videos(show_id, page, pagesize)
        if not data['results']:
            break
        for item in data['results']:
            try:
                video = VideoItemModel({
                    "title": item['title'],
                    "source_id": item['videoid'],
                    "url": "http://v.youku.com/v_show/id_%s.html" % item['videoid'],
                })
                jsurl = "javascript: getUrl('youku', '%s')" % item["videoid"]
                video["stream"] = [{"url": jsurl}]
                # TODO:
                # ret = api_plays(item['videoid'])
                # results = ret.get('results', {})
                # for key, fmt in FORMATS_NORMAL:
                #     if results.get(key):
                #         video["stream_low"] = self.extract_stream(results[key], fmt)
                #         break
                # for key, fmt in FORMATS_HIGH:
                #     if results.get(key):
                #         video["stream"] = self.extract_stream(results[key], fmt)
                #         break
                # for key, fmt in FORMATS_HD:
                #     if results.get(key):
                #         video["stream_high"] = self.extract_stream(results[key], fmt)
                #         break
                videos.append(video)
            except:
                self.logger.warning(get_exception_info())
        if pagesize * page >= data['total']:
            break
        page += 1
    return videos
def get_video(self, album_id, video_id):
    album_id = str(album_id)
    video_id = str(video_id)
    params = dict(PARAMS_PROTOTYPE)
    params.update(PARAMS_ALBUM)
    try:
        ret = call_api("getAlbum", params, [album_id, None, None, video_id, None, '1', '0'])
        if not ret.tv[0]._id or ret.tv[0]._id != video_id:
            ret = call_api("getAlbum", params, [video_id, None, None, None, '1', '0'])
    except:
        self.logger.warning(get_exception_info())
        return
    for i in range(ret.tv['count']):
        if ret.tv[i]._id == video_id:
            return extract_video(ret.tv[i])
    raise Exception("No video found for video_id = %s" % video_id)
def crawl(self):
    cid = self.data['cid']
    current_time = int(time.time())
    for album_data in self.get_albums(cid):
        try:
            album = extract_album(album_data, self.data['source'])
            if not album:
                continue
            checkup_time = time.mktime(album['time'].timetuple())
            # can't get video for paid item
            if (not album["price"]) and album.get('source_id'):
                Scheduler.schedule(
                    type=AlbumCrawler.type,
                    key=album['source_id'],
                    data={"time": album["time"]},
                    reset=(current_time - checkup_time) < 86400)
        except:
            self.logger.warning(get_exception_info())
    self.data['updated'] = current_time
def crawl(self):
    min_time = self.data['updated']  # update time of the newest video from the last crawl; lower bound for this crawl
    max_time = None                  # update time of the newest video found in this crawl
    page = 1
    while True:
        url = "http://bdzy.cc/list/?0-%s.html" % page
        hxs = load_html(url)  # fetch the page HTML and return an HtmlXPathSelector
        time = None
        for s in hxs.select("//body/.//tr[@class='row']"):  # parse the HTML with XPath
            try:
                href = s.select("td[1]/a/@href").extract()[0]
                source_id = re.findall("(\d+)\.html", href)[0]  # ID on the source site
                title = clean_title(s.select("td[1]/.//text()").extract()[0])
                region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                category = s.select("td[3]/.//text()").extract()[0]
                time = s.select("td[4]/.//text()").extract()[0]
                time = datetime.strptime(time, "%Y-%m-%d")
                if not max_time:  # the first row is the most recently updated
                    max_time = time
                if time < min_time:  # already reached data seen in the last crawl
                    break
                data = {  # extra data attached to the detail-page crawler task
                    "title": title,
                    "time": time,
                    "category": category,
                    "region": region,
                }
                # Get the attached data of the corresponding detail-page crawler; the "time"
                # field tells whether the content has been updated and needs re-crawling.
                # If the task is being created for the first time, the data is empty.
                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
                # Create the album crawler that fetches the detail page; key is the source-site ID.
                Scheduler.schedule(
                    AlbumCrawler.type,
                    source_id,
                    data,
                    reset=data["time"] > lasttime  # whether to force a re-crawl
                )
            except:
                self.logger.warning(get_exception_info())  # log the error and continue
                continue
        if time and time < min_time:  # already reached data seen in the last crawl
            break
        # get the total page count
        text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
        page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
        if page >= page_count:  # past the last page
            break
        page += 1
    if max_time:
        self.data = {'updated': max_time}  # save the newest update time seen in this crawl
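# get_exception_info() is used as the logging helper throughout these crawlers but is not
# defined in this listing. A minimal sketch, assuming it only formats the current exception
# and traceback into one log-friendly string (the real helper may attach more context):
import traceback

def get_exception_info():
    return traceback.format_exc()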