def crawl(self):
    cid = self.key
    channel = CHANNELS[int(cid)]
    page = 1
    pagesize = 30
    while 1:
        try:
            data = api_shows(cid, page, pagesize)
            if data is not None:
                page += 1
            else:
                return
        except:
            self.logger.warning(get_exception_info())
            continue
        if not data.get('results'):
            break
        for item in data['results']:
            try:
                show_id = item['tid']
                reset = (item['completed'] == 0)
                data = {
                    'channel': channel,
                    'image': item.get('show_vthumburl_hd') if item.get('show_vthumburl_hd') else item.get('show_thumburl_hd'),
                    'image2': item.get('show_thumburl_hd')
                }
                Scheduler.schedule(
                    AlbumCrawler.type, key=show_id, data=data, reset=reset)
            except:
                self.logger.warning(get_exception_info())
def crawl(self):
    cid = self.key
    page = 1
    pagesize = 30
    while True:
        list_data = api_list(cid, page, pagesize)
        # Page has results: schedule a crawler for each item, then move to the next page.
        if list_data.get('results'):
            for item in list_data.get('results'):
                # m = re.match("^\d+$", item['tid'])
                # if m:
                source_id = item['tid']
                reset = int(item['completed']) == 0
                Scheduler.schedule(
                    type=AlbumCrawler.type,
                    key=str(source_id),
                    reset=reset
                )
            page += 1
        # Page is empty: stop once past page 100, otherwise try the next page.
        if not list_data.get('results'):
            if page > 100:
                return
            else:
                page += 1
def crawl(self):
    cid = self.key
    page = 1
    pagesize = 30
    while True:
        list_data = api_list(cid, page, pagesize)
        # Page has results: schedule a crawler for each item, then move to the next page.
        if list_data.get('results'):
            for item in list_data.get('results'):
                # m = re.match("^\d+$", item['tid'])
                # if m:
                source_id = item['tid']
                reset = int(item['completed']) == 0
                Scheduler.schedule(type=AlbumCrawler.type,
                                   key=str(source_id),
                                   reset=reset)
            page += 1
        # Page is empty: stop once past page 100, otherwise try the next page.
        if not list_data.get('results'):
            if page > 100:
                return
            else:
                page += 1
def crawler_job(request):
    if request.method == "GET":
        url = request.GET.get('url', None)
        raw_job = Scheduler.get_job_from_url(url)
        job = {}
        if raw_job:
            job = {
                'type': raw_job['type'],
                'key': raw_job['key'],
                'priority': raw_job['priority'],
                'interval': raw_job['interval'],
                'lastrun': raw_job['lastrun'],
                'status': raw_job['status'],
                'to_album_id': raw_job['data'].get('to_album_id'),
            }
        return job
    else:
        url = request.GET.get('url', None)
        interval = request.GET.get('interval', 3600)
        channel = request.GET.get('channel', None)
        image = request.GET.get('image', None)
        data = {
            'channel': channel,
            'image': image,
        }
        success = Scheduler.schedule_url(url, data=data, interval=int(interval), reset=True)
        return {'status': int(success)}
def crawl(self):
    cid = self.key
    channel = self.data.get("channel_name")
    itemid = 0
    date = 0
    areaid = 0
    sort = 2  # 1: newest, 2: hottest
    start = 0
    num = 30
    while True:
        list_data = api_list(cid, itemid, date, areaid, sort, start, num, pcode, version)
        list_data = list_data.get("body")
        if list_data.get("data"):
            for item in list_data['data']:
                source_id = item.get("id")
                image = item.get("icon")
                reset = int(item['isend']) == 0
                Scheduler.schedule(type=AlbumCrawler.type,
                                   key=source_id,
                                   data={
                                       "channel": channel,
                                       "image": image
                                   },
                                   reset=reset)
                start += 1
        if not list_data.get("data"):
            return
def crawl(self):
    cid = self.key
    channel = self.data.get("channel_name")
    itemid = 0
    date = 0
    areaid = 0
    sort = 2  # 1: newest, 2: hottest
    start = 0
    num = 30
    while True:
        list_data = api_list(
            cid, itemid, date, areaid, sort, start, num, pcode, version)
        list_data = list_data.get("body")
        if list_data.get("data"):
            for item in list_data['data']:
                source_id = item.get("id")
                image = item.get("icon")
                reset = int(item['isend']) == 0
                Scheduler.schedule(
                    type=AlbumCrawler.type,
                    key=source_id,
                    data={
                        "channel": channel,
                        "image": image
                    },
                    reset=reset
                )
                start += 1
        if not list_data.get("data"):
            return
def init(conf=None):
    if not conf:
        conf = {}
    for cid in CHANNELS.keys():
        Scheduler.schedule(CategoryCrawler.type,
                           key=str(cid),
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))
def crawl(self):
    min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
    max_time = None
    time = None
    page = 1
    while True:
        url = "http://www.265zy.com/list/?0-%s.html" % page
        hxs = load_html(url)
        for s in hxs.select("//body/.//tr[@class='row']"):
            try:
                href = s.select("td[1]/a/@href").extract()[0]
                source_id = re.findall("(\d+)\.html", href)[0]
                title = clean_title(
                    s.select("td[1]/.//text()").extract()[0])
                region = s.select("td[2]/.//text()").extract()[0].replace(
                    u"地区", u"")
                category = s.select("td[3]/.//text()").extract()[0]
                time = s.select("td[4]/.//text()").extract()[0]
                time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
            except:
                continue
            if not max_time:
                max_time = time
            if time < min_time:
                break
            data = {
                "title": title,
                "time": time,
                'category': category,
                'region': region,
            }
            lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
            lasttime = lastdata.get(
                "time", datetime.min) if lastdata else datetime.min
            Scheduler.schedule(type=AlbumCrawler.type,
                               key=source_id,
                               data=data,
                               reset=data['time'] > lasttime)
        if time and time < min_time:
            break
        text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
        page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
        if page >= 5:
            break
        page += 1
    if max_time:
        if not self.data:
            self.data = {}
        self.data['updated'] = max_time
def init(conf=None):
    if not conf:
        conf = {}
    for catecode in CHANNELS.keys():
        Scheduler.schedule(CategoryCrawler.type,
                           key=str(catecode),
                           data={"catecode": catecode},
                           priority=conf.get('priority', Priority.Normal),
                           interval=conf.get('interval', 3600))
def init(conf=None):
    if not conf:
        conf = {}
    for channel in CHANNELS.iterkeys():
        Scheduler.schedule(HistoryCrawler.type,
                           key=channel,
                           data={"year": 1900},
                           priority=conf.get('priority', Priority.Normal),
                           interval=conf.get('interval', 86400))
def init(conf=None):
    if not conf:
        conf = {}
    for id in _CHANNEL_DCT.iterkeys():
        Scheduler.schedule(CategoryCrawler.type,
                           key=str(id),
                           data={"cid": id},
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))
def init(conf=None):
    if not conf:
        conf = {}
    Scheduler.schedule(
        ListCrawler.type,                              # crawler type
        key="",                                        # this type has a single instance, so key is ""
        priority=conf.get('priority', Priority.High),  # high priority
        data={'updated': datetime.min},                # extra data: update time of the newest video from the last crawl
        interval=conf.get('interval', 3600)            # re-crawl every hour
    )
def init(conf=None):
    if not conf:
        conf = {}
    Scheduler.schedule(
        ListCrawler.type,                              # crawler type
        key="",                                        # this type has a single instance, so key is ""
        priority=conf.get('priority', Priority.High),  # high priority
        data={'updated': datetime.min},                # extra data: update time of the newest video from the last crawl
        interval=conf.get('interval', 3600)            # re-crawl every hour
    )
def init(conf=None):
    if not conf:
        conf = {}
    for id, channel in CHANNELS.iteritems():
        data = {"channel": channel}
        Scheduler.schedule(ListCrawler.type,
                           key=id,
                           data=data,
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))
def init(conf=None):
    if not conf:
        conf = {}
    for key, data in CHANNELS.iteritems():
        Scheduler.schedule(
            ListCrawler.type,
            key=key,
            data=data,
            priority=conf.get('priority', Priority.High),
            interval=conf.get('interval', 3600)
        )
def crawl(self):
    min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
    max_time = None
    time = None
    page = 1
    while True:
        url = "http://zyqvod.com/?page=%s" % page
        hxs = load_html(url)
        for s in hxs.select("//table[@id='listTable']/tbody/tr"):
            try:
                source_id = re.findall("id=(\d+)", s.select("td[1]/a/@href").extract()[0])[0]
                title = clean_title(s.select("td[1]/.//text()").extract()[0])
                category = s.select("td[2]/.//text()").extract()[0]
                region = s.select("td[3]/.//text()").extract()[0]
                completed = s.select("td[4]/.//text()").extract()[0] == u"完结"
                time = s.select("td[5]/.//text()").extract()[0]
                time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
            except:
                continue
            if not max_time:
                max_time = time
            if time < min_time:
                break
            data = {
                "title": title,
                "time": time,
                'category': category,
                'region': region,
                'completed': completed,
            }
            lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
            lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
            Scheduler.schedule(type=AlbumCrawler.type,
                               key=source_id,
                               data=data,
                               reset=data['time'] > lasttime)
        if time and time < min_time:
            break
        text = hxs.select("//div[@class='page_num']/text()").extract()[0]
        page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
        if page >= page_count:
            break
        page += 1
    if max_time:
        if not self.data:
            self.data = {}
        self.data['updated'] = max_time
def init(conf=None):
    if not conf:
        conf = {}
    data = api_channel(pcode, version)
    for channel in data['body']['channel']:
        cid = channel.get("cid")
        channel_name = channel.get("name")
        crawl_data = {
            "channel_id": cid,
            "channel_name": channel_name,
        }
        Scheduler.schedule(ListCrawler.type,
                           key=cid,
                           data=crawl_data,
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))
def crawl(self):
    min_time = self.data.get('updated', datetime.min) if self.data else datetime.min
    max_time = None
    time = None
    page = 1
    while True:
        url = "http://www.265zy.com/list/?0-%s.html" % page
        hxs = load_html(url)
        for s in hxs.select("//body/.//tr[@class='row']"):
            try:
                href = s.select("td[1]/a/@href").extract()[0]
                source_id = re.findall("(\d+)\.html", href)[0]
                title = clean_title(s.select("td[1]/.//text()").extract()[0])
                region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                category = s.select("td[3]/.//text()").extract()[0]
                time = s.select("td[4]/.//text()").extract()[0]
                time = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
            except:
                continue
            if not max_time:
                max_time = time
            if time < min_time:
                break
            data = {
                "title": title,
                "time": time,
                'category': category,
                'region': region,
            }
            lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
            lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
            Scheduler.schedule(type=AlbumCrawler.type,
                               key=source_id,
                               data=data,
                               reset=data['time'] > lasttime)
        if time and time < min_time:
            break
        text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
        page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
        if page >= 5:
            break
        page += 1
    if max_time:
        if not self.data:
            self.data = {}
        self.data['updated'] = max_time
def crawl(self):
    type = 3
    channel_id = self.key
    channel = self.data['channel']
    start = 0
    num = 16
    params = {
        "order": "times",
        "time": "today"
    }
    while 1:
        list_data = api_list(type, channel_id, start, num, params)
        if start == list_data['num']:
            return
        for item in list_data['data']:
            if channel in LONG_VIDEO_CHANNELS.values():
                source_id = item['mid']
                tags = []
                time = item['public_time']
                time = datetime.strptime(time, "%Y%m%d")
                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get(
                    "time", datetime.min) if lastdata else datetime.min
                reset = time > lasttime
            else:
                source_id = item['flvid']
                tags = item.get("tags").split(",")
                time = datetime.utcnow()
                reset = False
            data = {
                "url": item.get("web_url"),
                "title": item.get("title"),
                "image": item.get("bpic"),
                "image2": item.get("mpic"),
                "description": item.get("introduce"),
                "duration": item.get("duration"),
                "tags": tags,
                "time": time,
                "channel": channel
            }
            Scheduler.schedule(
                AlbumCrawler.type,
                source_id,
                data,
                reset=reset
            )
            start += 1
def init(conf=None):
    if not conf:
        conf = {}
    for id, channel in CHANNELS.iteritems():
        data = {
            "channel": channel
        }
        Scheduler.schedule(
            ListCrawler.type,
            key=id,
            data=data,
            priority=conf.get('priority', Priority.High),
            interval=conf.get('interval', 3600)
        )
def process_album(self, item):
    sites = {}
    fangying_id = re.findall("f_(.+)\.html", item['link'])[0]
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        if not sites.has_key(site):
            sites[site] = []
        sites[site].append(dict(video))
    model = None
    for site, videos in sites.iteritems():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)
    if model:
        Scheduler.schedule(RelationCrawler.type,
                           key=fangying_id,
                           data={'title': model['title'], 'url': model['url']})
def crawl(self):
    page = 1
    while True:
        url = HOT_LIST % page
        video_list = loadurl(url)
        if video_list is None:
            break
        else:
            for videoinfo in video_list:
                video = videoinfo['video'][0]
                video['source'] = SOURCE
                Scheduler.schedule(AlbumCrawler.type, video.get('id'), data=video)
            page += 1
def init(conf=None):
    if not conf:
        conf = {}
    data = api_channel(pcode, version)
    for channel in data['body']['channel']:
        cid = channel.get("cid")
        channel_name = channel.get("name")
        crawl_data = {
            "channel_id": cid,
            "channel_name": channel_name,
        }
        Scheduler.schedule(ListCrawler.type,
                           key=cid,
                           data=crawl_data,
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))
def crawl(self):
    type = 3
    channel_id = self.key
    channel = self.data['channel']
    start = 0
    num = 16
    params = {"order": "times", "time": "today"}
    while 1:
        list_data = api_list(type, channel_id, start, num, params)
        if start == list_data['num']:
            return
        for item in list_data['data']:
            if channel in LONG_VIDEO_CHANNELS.values():
                source_id = item['mid']
                tags = []
                time = item['public_time']
                time = datetime.strptime(time, "%Y%m%d")
                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get(
                    "time", datetime.min) if lastdata else datetime.min
                reset = time > lasttime
            else:
                source_id = item['flvid']
                tags = item.get("tags").split(",")
                time = datetime.utcnow()
                reset = False
            data = {
                "url": item.get("web_url"),
                "title": item.get("title"),
                "image": item.get("bpic"),
                "image2": item.get("mpic"),
                "description": item.get("introduce"),
                "duration": item.get("duration"),
                "tags": tags,
                "time": time,
                "channel": channel
            }
            Scheduler.schedule(AlbumCrawler.type, source_id, data, reset=reset)
            start += 1
def crawl(self):
    page = 1
    while True:
        url = HOT_LIST % page
        video_list = loadurl(url)
        if video_list is None:
            break
        else:
            for videoinfo in video_list:
                video = videoinfo['video'][0]
                video['source'] = SOURCE
                Scheduler.schedule(
                    AlbumCrawler.type,
                    video.get('id'),
                    data=video
                )
            page += 1
def crawl(self):
    for channel in CHANNELS:
        list_url = LIST % (DICT.get(channel), 1)  # open the first page of the chart
        pagenum = int(loadurl(list_url).get('pagenum'))
        for page in range(pagenum):
            page += 1  # pages are counted from 1
            current_url = LIST % (DICT.get(channel), page)
            lists = loadurl(current_url).get('lists')
            for episode in lists:
                data = {
                    'title': episode.get('name'),
                    'image': episode.get('pic'),
                    'category': episode.get('cate'),
                    'channel': channel,
                    'source': SOURCE
                }
                Scheduler.schedule(
                    AlbumCrawler.type,
                    episode.get('mid'),
                    data,
                    reset=True
                )
def crawl(self):
    cid = self.key
    channel = CHANNELS[int(cid)]
    page = 1
    pagesize = 30
    while 1:
        try:
            data = api_shows(cid, page, pagesize)
            if data is not None:
                page += 1
            else:
                return
        except:
            self.logger.warning(get_exception_info())
            continue
        if not data.get('results'):
            break
        for item in data['results']:
            try:
                show_id = item['tid']
                reset = (item['completed'] == 0)
                data = {
                    'channel': channel,
                    'image': item.get('show_vthumburl_hd') if item.get('show_vthumburl_hd') else item.get('show_thumburl_hd'),
                    'image2': item.get('show_thumburl_hd')
                }
                Scheduler.schedule(AlbumCrawler.type, key=show_id, data=data, reset=reset)
            except:
                self.logger.warning(get_exception_info())
def crawl(self):
    cid = self.data['cid']
    current_time = int(time.time())
    for album_data in self.get_albums(cid):
        try:
            album = extract_album(album_data, self.data['source'])
            if not album:
                continue
            checkup_time = time.mktime(album['time'].timetuple())
            # can't get video for paid item
            if (not album["price"]) and album.get('source_id'):
                Scheduler.schedule(
                    type=AlbumCrawler.type,
                    key=album['source_id'],
                    data={"time": album["time"]},
                    reset=(current_time - checkup_time) < 86400)
        except:
            self.logger.warning(get_exception_info())
    self.data['updated'] = current_time
def crawl(self):
    cid = self.data['cid']
    current_time = int(time.time())
    for album_data in self.get_albums(cid):
        try:
            album = extract_album(album_data, self.data['source'])
            if not album:
                continue
            checkup_time = time.mktime(album['time'].timetuple())
            # can't get video for paid item
            if (not album["price"]) and album.get('source_id'):
                Scheduler.schedule(
                    type=AlbumCrawler.type,
                    key=album['source_id'],
                    data={"time": album["time"]},
                    reset=(current_time - checkup_time) < 86400
                )
        except:
            self.logger.warning(get_exception_info())
    self.data['updated'] = current_time
def schedule(request):
    response = {}
    type = request.GET.get("type")
    nextrun = request.GET.get("nextrun")
    if type.endswith("album"):
        response['error_info'] = "Type Error."
        response['status'] = False
        return response
    try:
        nextrun = datetime.strptime(nextrun, "%Y-%m-%d-%H-%M-%S")
    except:
        response['error_info'] = "Datetime Error."
        response['status'] = False
        return response
    m = Scheduler.monitor_schedule(type, nextrun)
    if m is not None:
        response['error_info'] = ""
        response['status'] = True
    else:
        response['error_info'] = "Type Error."
        response['status'] = False
    return response
The work is split between two crawlers: one crawls the update chart, the other crawls the detail page of a single video.
The chart crawler creates the corresponding detail crawlers and passes them the required parameters and data.
Update chart URL: http://bdzy.cc/list/?0-1.html
Video detail URL: http://bdzy.cc/detail/?20808.html
'''
import requests, re, HTMLParser
from datetime import datetime
from scrapy.selector import HtmlXPathSelector
from contentservice.crawler import Crawler, Scheduler, Priority, export
from contentservice.models.video import VideoSourceModel, VideoItemModel
from contentservice.utils.datetimeutil import parse_date
from contentservice.utils.text import split
from contentservice.utils import get_exception_info

'''
Key method reference

Scheduler.schedule(type, key, priority, data, reset, interval, timeout)
    type     - type of the crawler task
    key      - unique identifier of the task within that type (type and key together identify the crawled content; key is usually the source-site id)
    priority - task priority: High, Normal, Low
    data     - extra data (persists state tied to this crawler instance); it is saved automatically after every run
    reset    - force a re-crawl; by default tasks that have already completed are not crawled again
    interval - interval between repeated runs of the task; 0 means run only once (default 0)
    timeout  - timeout after which the task is killed automatically

Crawler - base class that every crawler type must inherit from
Methods:
    init(conf=None) - initialization (called once at program startup), used to create the initial crawler tasks
    crawl()         - main crawl routine
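
Example (a minimal sketch of how these pieces fit together; DemoListCrawler,
DemoAlbumCrawler and demo_api_list are hypothetical placeholder names used only
for illustration, not part of the framework described above):

    def init(conf=None):
        if not conf:
            conf = {}
        # create the single chart task; it re-runs every hour
        Scheduler.schedule(DemoListCrawler.type,
                           key="",
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))

    class DemoListCrawler(Crawler):
        type = "demo_list"

        def crawl(self):
            # fetch one page of the chart and create one detail task per entry;
            # key is the source-site id, data is persisted with the task
            for item in demo_api_list(page=1):
                Scheduler.schedule(DemoAlbumCrawler.type,
                                   key=str(item['id']),
                                   data={'title': item['title']},
                                   reset=False)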
def process_album(self, item):
    sites = {}
    fangying_id = re.findall("f_(.+)\.html", item['link'])[0]
    for play in item['plays']:
        site = play['site']
        if site not in SITES:
            continue
        if play["url"].find("fangying.com") != -1:
            stream = []
        else:
            format = "thunder" if site == "thunder" else ""
            stream = [{"url": play["url"], "format": format}]
        video = VideoItemModel({
            "title": play["title"],
            "url": play["url"],
            "stream": stream,
        })
        if not sites.has_key(site):
            sites[site] = []
        sites[site].append(dict(video))
    model = None
    for site, videos in sites.iteritems():
        model = VideoSourceModel({
            "source": self.data['source'],
            "source_id": fangying_id,
            "videos": videos,
            "title": item['title'],
            "directors": item['directors'].split("/"),
            "actors": item['performers'].split("/"),
            "description": item['description'],
            'categories': item['genres'].split("/"),
            'region': item['countries'].split("/")[0],
            'duration': parse_duration(item['duration']),
            'image': item['avatar_middle'],
            'score': float(item['douban_rating']) if item.get('douban_rating') else None,
            'url': item['link'],
            'price': 0.0,
            'pubtime': parse_pubtime(item['release_time']),
            'channel': CHANNELS.get(self.key)
        })
        export(model)
    if model:
        Scheduler.schedule(RelationCrawler.type,
                           key=fangying_id,
                           data={
                               'title': model['title'],
                               'url': model['url']
                           })
def crawl(self):
    min_time = self.data['updated']  # update time of the newest video from the last crawl; lower bound for this crawl
    max_time = None                  # update time of the newest video seen in this crawl
    page = 1
    while True:
        url = "http://bdzy.cc/list/?0-%s.html" % page
        hxs = load_html(url)  # load the page html; returns an HtmlXPathSelector
        time = None
        for s in hxs.select("//body/.//tr[@class='row']"):  # parse the html with xpath
            try:
                href = s.select("td[1]/a/@href").extract()[0]
                source_id = re.findall("(\d+)\.html", href)[0]  # source-site id
                title = clean_title(s.select("td[1]/.//text()").extract()[0])
                region = s.select("td[2]/.//text()").extract()[0].replace(u"地区", u"")
                category = s.select("td[3]/.//text()").extract()[0]
                time = s.select("td[4]/.//text()").extract()[0]
                time = datetime.strptime(time, "%Y-%m-%d")
                if not max_time:  # the first row is the most recently updated
                    max_time = time
                if time < min_time:  # reached the newest data from the previous crawl
                    break
                # extra data for the detail-page crawler task
                data = {
                    "title": title,
                    "time": time,
                    "category": category,
                    "region": region,
                }
                # Fetch the extra data of the corresponding detail-page crawler; the time field tells whether
                # the content has been updated and must be re-crawled. Empty when the task is first created.
                lastdata = Scheduler.get_data(AlbumCrawler.type, source_id)
                lasttime = lastdata.get("time", datetime.min) if lastdata else datetime.min
                # Create the album crawler that fetches the detail page; key is the source-site id.
                Scheduler.schedule(
                    AlbumCrawler.type,
                    source_id,
                    data,
                    reset=data["time"] > lasttime  # force a re-crawl if the content was updated
                )
            except:
                self.logger.warning(get_exception_info())  # log the error and continue
                continue
        if time and time < min_time:  # reached the newest data from the previous crawl
            break
        # total number of pages
        text = hxs.select("//div[@class='pages']/span/text()").extract()[0]
        page_count = int(re.findall(u"\d+/(\d+)页", text)[0])
        # past the last page
        if page >= page_count:
            break
        page += 1
    if max_time:
        self.data = {'updated': max_time}  # persist the newest update time for the next run
def crawl(self):
    channel_id = self.key
    channel = self.data['c_name']
    list_params = self.data['c_list_param']
    page = list_params['page']
    pagesize = 24
    now = int(time.time())
    # merge list_params with PARAMS_INFO
    params = dict(list_params)
    params.update(PARAMS_INFO)
    while True:
        list_data = api_list(
            auto_id=channel_id, page=page, pagesize=pagesize, params=params)
        if list_data['returncode'] != 404:
            if list_data.get('cover'):
                for item in list_data["cover"]:
                    source_id = item.get("c_cover_id")
                    pubtime = item.get("c_year")
                    checkup_time = datetime.strptime(
                        item['c_checkup_time'], "%Y-%m-%d %H:%M:%S")
                    checkup_time = time.mktime(checkup_time.timetuple())
                    data = {
                        "source_id": source_id,
                        "title": item.get("c_title"),
                        "image": item.get("c_pic"),
                        "actors": item.get("c_actor"),
                        "directors": item.get("c_director"),
                        "categories": item.get("c_subtype"),
                        "channel": channel,
                        "region": item.get("c_area"),
                        "pubtime": pubtime,
                    }
                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=source_id,
                        data=data,
                        # if the checkup time is within the last three hours, set reset=True
                        reset=(now - checkup_time) < 10800
                    )
                page += 1
            if list_data.get('video'):
                for item in list_data["video"]:
                    source_id = item.get("c_vid")
                    pubtime = item.get("c_ctime")
                    data = {
                        "source_id": source_id,
                        "title": item.get("c_title"),
                        "image": item.get("c_pic"),
                        "channel": channel,
                        "pubtime": pubtime,
                    }
                    Scheduler.schedule(
                        type=AlbumCrawler.type,
                        key=source_id,
                        data=data,
                    )
                page += 1
        else:
            return
def init(conf=None):
    if not conf:
        conf = {}
    Scheduler.schedule(TopCrawler.type,
                       priority=conf.get('priority', Priority.High),
                       interval=conf.get('interval', 86400))
def init(conf=None):
    if not conf:
        conf = {}
    for catecode in CHANNELS.keys():
        Scheduler.schedule(CategoryCrawler.type,
                           key=str(catecode),
                           data={"catecode": catecode},
                           priority=conf.get('priority', Priority.Normal),
                           interval=conf.get('interval', 3600))
def init(conf=None):
    if not conf:
        conf = {}
    for id in _CHANNEL_DCT.iterkeys():
        Scheduler.schedule(CategoryCrawler.type,
                           key=str(id),
                           data={"cid": id},
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))
def init(conf=None):
    if not conf:
        conf = {}
    for channel in CHANNELS.iterkeys():
        Scheduler.schedule(HistoryCrawler.type,
                           key=channel,
                           data={"year": 1900},
                           priority=conf.get('priority', Priority.Normal),
                           interval=conf.get('interval', 86400))
def init(conf=None):
    if not conf:
        conf = {}
    for cid in CHANNELS.keys():
        Scheduler.schedule(CategoryCrawler.type,
                           key=str(cid),
                           priority=conf.get('priority', Priority.High),
                           interval=conf.get('interval', 3600))