import importlib

from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class scheduler():

    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Create the spider class dynamically by importing its module.
        try:
            module = self.modulePath + spiderModulePath
            module = importlib.import_module(module)
            class_ = getattr(module, spiderClass)
            # Recent Scrapy versions require the spider class itself (not an
            # instance) as the first argument to crawl().
            self.sched.add_job(self.process.crawl, 'date',
                               args=[class_], run_date=scheduleTime)
        except Exception as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            # Stop the reactor once every scheduled crawl has finished.
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()
        except Exception as error:
            print(error)
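A minimal usage sketch of the class above; the spider module name, class name, and run date are hypothetical placeholders, not part of the original project.

# Hypothetical usage sketch: 'course_spider' / 'CourseSpider' are placeholders.
from datetime import datetime, timedelta

s = scheduler()
s.addJob('course_spider', 'CourseSpider',
         datetime.now() + timedelta(minutes=1))  # one-shot 'date' job
s.runJob()  # starts the scheduler, then blocks in reactor.run()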
from apscheduler.schedulers.twisted import TwistedScheduler
from apscheduler.triggers.cron import CronTrigger


def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    # Fire once a day at 10:42:<seconds>.
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger,
                      args=[source_type, source_key, document_item],
                      misfire_grace_time=120)  # tolerate up to 2 minutes of lateness
    scheduler.start()
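`print_time` and `YJdItem` are defined elsewhere in the source project. A minimal stand-in for the scheduled callable, assuming it only needs the three arguments passed by `add_job`, might look like this:

# Hypothetical stand-in; the real print_time presumably kicks off a crawl
# for the given source.
from datetime import datetime

def print_time(source_type, source_key, document_item):
    print('%s: source_type=%s source_key=%s item=%r'
          % (datetime.now(), source_type, source_key, document_item))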
def startYourEngines(self):
    sched = TwistedScheduler()
    sched.start()
    # Register the polling job only if it is not already scheduled.
    if not sched.get_job('host_status'):
        sched.add_job(self.hoststatus, 'interval', seconds=10, id='host_status')
def log(self, logdata, retry=True):
    logdata = self.sanitizeLog(logdata)
    jsondata = json.dumps(logdata, sort_keys=True)
    if logdata['src_host'] != '127.0.0.1' and logdata['dst_host'] != '':
        # Forward remote events to the server as a one-shot job.
        import uuid
        scheduler = TwistedScheduler()
        scheduler.add_job(self.post2server, args=[self.serverip, jsondata],
                          id=str(uuid.uuid1()))
        scheduler.start()
    elif logdata['src_host'] != '127.0.0.1':
        self.logger.warn(jsondata)
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings


def schedule():
    export_scheduler = BackgroundScheduler()  # background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler
    process = CrawlerProcess(get_project_settings())  # crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects every spider in the project
    crawler_scheduler = TwistedScheduler()  # Twisted scheduler, since Scrapy is built on Twisted
    for spidername in sloader.list():  # schedule a crawl for each spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # start a crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # stop_after_crawl=False keeps the reactor (and the schedules) alive
def run(self, args, opts):
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    scheduler = TwistedScheduler()
    for spider_name in crawler_process.spider_loader.list():
        if spider_name in self.excludes:
            continue
        spider_cls = crawler_process.spider_loader.load(spider_name)
        # 86400 seconds = one crawl per day for every non-excluded spider.
        scheduler.add_job(crawler_process.crawl, 'interval',
                          args=[spider_cls], seconds=86400)
    scheduler.start()
    crawler_process.start(False)
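This `run(self, args, opts)` signature matches a custom Scrapy command. A sketch of the surrounding wiring, using hypothetical names for the command class and module:

# Hypothetical command wrapper; class and module names are placeholders.
from scrapy.commands import ScrapyCommand

class Command(ScrapyCommand):
    requires_project = True
    excludes = []  # spider names to skip

    def short_desc(self):
        return 'Schedule a daily crawl of every spider'

    # run(self, args, opts) as shown above

# settings.py: register the package so `scrapy crawlall` finds the command.
# COMMANDS_MODULE = 'myproject.commands'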
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PlaysportCrawler)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, 'interval', hours=3, args=[PlaysportCrawler])
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)
    _ = _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))
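`my_listener` and `_notifier` come from elsewhere in the source. A minimal listener of the shape APScheduler expects, assuming it only needs to report successes and failures:

# Hypothetical listener; the real my_listener is not shown in the source.
from apscheduler.events import EVENT_JOB_ERROR, EVENT_JOB_EXECUTED

def my_listener(event):
    if event.exception:
        print('Job %s raised %r' % (event.job_id, event.exception))
    else:
        print('Job %s executed' % event.job_id)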
def twisted_schedule():
    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
class Scheduler:

    def __init__(self):
        self.scrapers = [
            HistorySpider,
            WpbccSpider,
            LWVChicago,
            LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval
        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)

    def add_schedule(self, scraper, seconds_delay):
        self.scheduler.add_job(self.run_scraper,
                               id=scraper.__name__,
                               trigger='interval',
                               args=[scraper],
                               start_date=datetime.now() + relativedelta(seconds=seconds_delay),
                               seconds=self.interval_seconds)

    def schedule_missed(self, event):
        print(f'{event.job_id} missed. Interval time: {self.interval_seconds}')

    def run_scraper(self, scraper):
        start_date = datetime.now().strftime('%m-%d-%Y')
        end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')
        print(f'{datetime.now()} starting {scraper.__name__}')
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(scraper, start_date, end_date)
        runner.join()

    def run_schedule(self):
        configure_logging()
        # Stagger the first runs so the scrapers don't all start at once.
        start_interval = self.interval_seconds / len(self.scrapers)
        now = datetime.now()
        self.last_scheduled = now
        for index, scraper in enumerate(self.scrapers):
            self.add_schedule(scraper, start_interval * index)
        self.scheduler.start()
        reactor.run()
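A usage sketch, assuming `config.schedule_interval` and the spider classes are importable from the project:

# Usage sketch: blocks in reactor.run() until interrupted.
if __name__ == '__main__':
    Scheduler().run_schedule()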
def schedule(self):
    scheduler = TwistedScheduler(
        {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})
    # TODO: use random interval
    switch = {
        'debug': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
        'hourly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=3600),
        'daily': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400),
        'weekly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400 * 7),
        'monthly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400 * 30),
    }
    switch[self.settings.get('APP_CRAWL_INTERVAL')]()
    scheduler.start()
class Simple(resource.Resource):
    # The class name is inferred from Simple(sched) below; the constructor,
    # which stores the scheduler on self.scheduler, is not part of this excerpt.
    isLeaf = True

    def render_GET(self, request):
        # self.count += 1
        sessionweb = DBSession()
        # Two filter criteria ANDed together; Python's `and` does not work
        # on SQLAlchemy column expressions.
        spiderRule = sessionweb.query(SpiderRule).filter(
            SpiderRule.enable == True,
            SpiderRule.cron.is_(None)).one()
        # spiderRule.name = '%s-%s' % (spiderRule.name, self.count)
        # print('I have been called %s times!!!' % spiderRule.name)
        self.scheduler.add_job(crawl, 'date', args=[spiderRule],
                               name=spiderRule.name,
                               id='%s' % spiderRule.id,
                               replace_existing=True)
        sessionweb.close()
        request.setHeader("Content-Type", "text/html; charset=utf-8")
        return ("<html>Hello, world!</html>").encode('utf-8')

    def render_POST(self, request):
        pass


site = server.Site(Simple(sched))
endpoint = endpoints.TCP4ServerEndpoint(reactor, 8080)
endpoint.listen(site)
sched.start()
reactor.run()
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    # Fire every day at 00:19:<seconds>.
    trigger = CronTrigger(hour=0, minute=19, second=seconds)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()
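`runner` is created elsewhere in the source (the same pattern appears in the `IntervalTrigger` variant at the end of this section). A sketch of the likely setup, assuming a plain Scrapy `CrawlerRunner`:

# Hypothetical setup for the module-level `runner` used above.
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor

runner = CrawlerRunner(get_project_settings())

# trigger_spider_job(SomeSpider)  # SomeSpider is a placeholder
# reactor.run()                   # jobs only fire while the reactor is running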
class Spiders(resource.Resource):
    u"""Show each spider's working status and its schedule."""
    config = Config()

    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items
        self.spider_status_dic = {}
        logging.basicConfig()
        self.scheduler = TwistedScheduler()
        self.scheduler.start()

    def get_spider_status(self, project):
        spider_status = self.spider_status_dic.get(project)
        if not spider_status:
            spider_status = dict((spider, {'status': 'finished',
                                           'timestamp': None,
                                           'job': None,
                                           'schedule_job': None})
                                 for spider in get_spider_list(project))
            self.spider_status_dic[project] = spider_status
        self._update_spider_status(project)
        return spider_status

    def _update_spider_status(self, project):
        u"""First collect the current scrapyd queue state, then the jobs known to APScheduler."""
        spider_status = self.spider_status_dic.get(project)
        for project, queue in self.root.poller.queues.items():
            for m in queue.list():
                spider = m['name']
                job = m['_job']
                spider_status[spider]['status'] = 'pending'
                spider_status[spider]['timestamp'] = None
                spider_status[spider]['job'] = job
        for p in self.root.launcher.processes.values():
            spider = p.spider
            spider_status[spider]['status'] = 'running'
            spider_status[spider]['timestamp'] = p.start_time
            spider_status[spider]['job'] = p.job
        for p in self.root.launcher.finished:
            spider = p.spider
            spider_status[spider]['status'] = 'finished'
            spider_status[spider]['timestamp'] = p.end_time
            spider_status[spider]['job'] = p.job
        for spider in spider_status:
            status = spider_status[spider]
            sjob = self.scheduler.get_job(spider)
            status['schedule_job'] = sjob
            if sjob:
                status['next_time'] = sjob.next_run_time
            else:
                status['next_time'] = None
            # sjob._get_run_times()

    def render_GET(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        spider_status = self.get_spider_status(project)
        content = "<tr>"
        for th in ['spider', 'status', 'timestamp', 'next_time', 'data']:
            content += "<th>%s</th>" % th
        content += "</tr>"
        for spider in spider_status:
            status = spider_status[spider]
            content += "<tr>"
            content += "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>" \
                % (spider, status['status'], status['timestamp'], status['next_time'])
            content += "<td><a href='/data/%s/'>data</a></td>" % spider
            content += "</tr>"
        sub_form = "<form action='' method='post'><input type='submit' value='Start all jobs'></input></form>"
        html = "<table>" + content + "</table>" + sub_form
        return html

    def render_POST(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        # spiders = ['NJU', 'BIT', 'ECUST', 'RUC']
        spiders = get_spider_list(project)
        tstart = dt.datetime.utcnow()
        for spider in spiders:
            # Stagger the first runs five seconds apart, then hourly.
            job = self.scheduler.add_job(spider_crawl, 'interval', minutes=60,
                                         replace_existing=True, id=spider,
                                         next_run_time=tstart,
                                         args=[project, spider])
            tstart = tstart + dt.timedelta(seconds=5)
        return "<span>All jobs started</span><a href='/'>back</a>"
def start(verbose, debug, proxy, min, product, brand, serie, check, delay, news, days):

    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error('InfluxDB connection error')
            sys.exit(1)
        else:
            log.success('InfluxDB connection OK')

    if check:
        check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True
    if delay:
        settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl, 'interval', args=[BrandSpider],
                      kwargs={'auto': True, 'Ids': brand}, days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl, 'interval', args=[SerieSpider],
                      kwargs={'auto': True, 'Ids': serie}, days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl, 'interval', args=[ProductSpider],
                      kwargs={'fromDB': True}, days=1)
        process.crawl(ProductSpider, fromDB=True)

    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)
    sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                  kwargs={'soldNum_min': min, 'Ids': product}, hours=6)
    if news:
        sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                      kwargs={'newItem': True, 'days': days}, hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)
    log.info('Starting product price tracking')
    sched.start()
    process.start(False)
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from autobahn.twisted.wamp import ApplicationSession

# `tick` is not defined in the original excerpt; a minimal stand-in:
def tick():
    print('Tick!')

scheduler = TwistedScheduler()
scheduler.add_job(tick, 'interval', seconds=3)
scheduler.start()
reactor.run()  # jobs only fire while the reactor is running
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from __future__ import print_function
from logging import basicConfig, ERROR
from apscheduler.schedulers.twisted import TwistedScheduler

basicConfig(level=ERROR)
SCHEDULER = TwistedScheduler()
SCHEDULER.start()


def ms(mills):
    """ Converts milliseconds to seconds """
    return mills * 0.001
def run_scheduler(flask_app):
    scheduler = TwistedScheduler()
    JobsAdder(scheduler, flask_app).add_jobs()
    scheduler.start()
# The block below is presumably the body of the CreateTask job scheduled in the
# main guard; the enclosing def is inferred, and `cf` (a ConfigParser-style
# object) comes from elsewhere in the source.
def CreateTask(settings):
    mysql_client = SQLServer.from_settings(settings, cf.get("MYSQL_SERVER", "type"),
                                           db=cf.get("MYSQL_SERVER", "db"))
    sql = "select COLUMN_NAME from information_schema.COLUMNS where table_name = 'CRAWLER_SPIDER_TASK';"
    column_name_list = [x[0] for x in mysql_client.select(sql)]  # all column names of the task table
    sql = "SELECT {} FROM `CRAWLER_SPIDER_TASK` WHERE isUse=1;".format(",".join(column_name_list))
    site_info_dict_list = []
    for site_info in mysql_client.select(sql):  # fetch every enabled task and convert each row to a dict
        item = {}
        for i, x in enumerate(column_name_list):
            item[x] = site_info[i]
        site_info_dict_list.append(item)
    for site_info in site_info_dict_list:
        site_info["cf"] = cf
        settings.set("CONCURRENT_REQUESTS", site_info.get("CONCURRENT_REQUESTS", 16),
                     priority="project")
        crawler_process.crawl(site_info["SpiderName"], **site_info)


if __name__ == '__main__':
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    Scheduler = TwistedScheduler()
    if get_current_ip() != settings.get("MASTER_HOST", ""):
        print(get_current_ip())
        print(settings.get("MASTER_HOST", ""))
        print("consumer cluster")
        runAllSpiderConsume()
    else:
        # RunCrawlerServer()
        Scheduler.add_job(func=CreateTask, trigger='interval', seconds=2,
                          args=(settings,), id='Test')
        Scheduler._logger = logger
        Scheduler.start()
        reactor.run()
# if __name__ == '__main__':
try:
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    # scheduler.add_job(process.crawl, 'interval', args=[ChoansanSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[GangdongSpider], seconds=10)
    scheduler.add_job(process.crawl, 'interval', args=[JoongrangsoopSpiderSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[ImjingakSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[PyeongtaekSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[CampunakSpider], seconds=15)
    scheduler.start()
    process.start(False)
except (KeyboardInterrupt, SystemExit):
    print("stop process")


def getSaturday():
    # today = date.today()
    # print(today)
    # print(today.weekday())
    # offset = (today.weekday() - 5) % 7
    # print(offset)
    # print(timedelta(days=offset))
    # last_saturday = today - timedelta(days=offset)
    # print(last_saturday)
    pass  # body is commented out; pass keeps the function syntactically valid
def trigger_spider_job(spider, seconds=10):
    scheduler = TwistedScheduler()
    # First run `seconds` from now, then every 8 hours.
    start_time = datetime.datetime.now() + datetime.timedelta(seconds=seconds)
    trigger = IntervalTrigger(hours=8, start_date=start_time)
    scheduler.add_job(runner.crawl, trigger, args=[spider])
    scheduler.start()