class scheduler():
    def __init__(self):
        self.modulePath = "AutoCourseInfo_Scrapy.CourseInfoExtract.CourseInfoExtract.spiders."
        self.sched = TwistedScheduler()
        self.process = CrawlerRunner(get_project_settings())

    def addJob(self, spiderModulePath, spiderClass, scheduleTime):
        # Create the spider object dynamically by importing its module.
        try:
            module = self.modulePath + spiderModulePath
            module = importlib.import_module(module)
            class_ = getattr(module, spiderClass)
            instance = class_()
            self.sched.add_job(self.process.crawl, 'date', args=[instance], run_date=scheduleTime)
        except Exception as error:
            print(error)

    def runJob(self):
        try:
            self.sched.start()
            d = self.process.join()
            d.addBoth(lambda _: reactor.stop())
            reactor.run()
        except Exception as error:
            print(error)
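A minimal usage sketch for the class above. The spider module name, spider class name, and run time passed to addJob are hypothetical placeholders, not values from the original project:

from datetime import datetime, timedelta

s = scheduler()
# Schedule a hypothetical spider five minutes from now.
s.addJob('example_spider', 'ExampleSpider', datetime.now() + timedelta(minutes=5))
# Start APScheduler and the Twisted reactor; the join() deferred is intended to
# stop the reactor once the scheduled crawl has finished.
s.runJob()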
def setup(cls, conf):
    if "sql" in conf:
        DatabaseMixin.sql = \
            adbapi.ConnectionPool("psycopg2",
                                  host=conf["sql"]['host'],
                                  database=conf["sql"]['database'],
                                  user=conf["sql"]['username'],
                                  password=conf["sql"]['password'],
                                  cp_min=1, cp_max=10,
                                  cp_reconnect=True,
                                  cp_noisy=conf["debug"])
    if "redis" in conf:
        DatabaseMixin.redis = \
            cyclone.redis.lazyConnectionPool(
                host=conf["redis"]['host'],
                dbid=conf["redis"]['dbid'],
                poolsize=10,
                reconnect=True)
        if conf["redis"].get("pubsub", False):
            pubsub = cyclone.redis.SubscriberFactory()
            pubsub.maxDelay = 20
            pubsub.continueTrying = True
            pubsub.protocol = PubSubProtocol
            reactor.connectTCP(conf["redis"]['host'], 6379, pubsub)
    DatabaseMixin.sched = TwistedScheduler()
    DatabaseMixin.build()
    DatabaseMixin.sched.start()
def twisted_schedule():
    from twisted.internet import reactor
    from apscheduler.schedulers.twisted import TwistedScheduler

    def tick():
        print('Tick! The time is: %s' % datetime.now())

    scheduler = TwistedScheduler()
    scheduler.add_job(tick, 'interval', seconds=3)
    scheduler.start()
    print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C'))

    # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed.
    try:
        reactor.run()
    except (KeyboardInterrupt, SystemExit):
        pass
def startYourEngines(self):
    sched = TwistedScheduler()
    sched.start()
    if not sched.get_job('host_status'):
        sched.add_job(self.hoststatus, 'interval', seconds=10, id='host_status')
def trigger_spider_job(seconds=10, source_type="1", source_key="jd", document_item=YJdItem):
    scheduler = TwistedScheduler()
    trigger = CronTrigger(hour=10, minute=42, second=seconds)
    scheduler.add_job(print_time, trigger, args=[source_type, source_key, document_item],
                      misfire_grace_time=120)
    scheduler.start()
def main():
    process = CrawlerProcess(get_project_settings())
    process.crawl(PlaysportCrawler)
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, 'interval', hours=3, args=[PlaysportCrawler])
    scheduler.add_listener(my_listener, EVENT_JOB_EXECUTED | EVENT_JOB_ERROR)
    scheduler.start()
    process.start(False)
    _ = _notifier(msg='\n'.join([
        "Scheduler Start",
    ]))
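The my_listener callback registered above is not shown in the snippet. A minimal sketch of such a listener, following the pattern from the APScheduler user guide (the body is an assumption, not the original implementation):

def my_listener(event):
    # Hypothetical listener: report whether a scheduled crawl job succeeded or crashed.
    if event.exception:
        print('The job crashed :(')
    else:
        print('The job worked :)')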
def create_scheduler(reactor):
    jobstores = {"default": MemoryJobStore()}
    executors = {"default": TwistedExecutor()}
    job_defaults = {
        "coalesce": False,
        "max_instances": 1,
        "misfire_grace_time": 10
    }
    return TwistedScheduler(
        jobstores=jobstores,
        executors=executors,
        job_defaults=job_defaults,
        reactor=reactor,
    )
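A possible way to wire the factory above into a Twisted application. The heartbeat job and its interval are illustrative assumptions; only create_scheduler comes from the snippet:

from twisted.internet import reactor


def heartbeat():
    # Hypothetical job body, used only to show the wiring.
    print('still alive')


scheduler = create_scheduler(reactor)
scheduler.add_job(heartbeat, 'interval', seconds=30)
scheduler.start()
reactor.run()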
def log(self, logdata, retry=True):
    logdata = self.sanitizeLog(logdata)
    jsondata = json.dumps(logdata, sort_keys=True)
    if logdata['src_host'] != '127.0.0.1' and logdata['dst_host'] != '':
        import uuid
        scheduler = TwistedScheduler()
        scheduler.add_job(self.post2server, args=[self.serverip, jsondata], id=str(uuid.uuid1()))
        scheduler.start()
    elif logdata['src_host'] != '127.0.0.1':
        self.logger.warn(jsondata)
def __init__(self, scheduler):
    '''
    https://apscheduler.readthedocs.io/en/latest/userguide.html?highlight=add_job

    Parameters
    ----------
    scheduler: [str]
        Which scheduler to use, chosen according to the application's needs:
        'BlockingScheduler'   blocking scheduler: for programs that run nothing but the scheduler
        'BackgroundScheduler' background scheduler: for non-blocking use; runs in the background
        'AsyncIOScheduler'    AsyncIO scheduler: for applications built on asyncio
        'GeventScheduler'     Gevent scheduler: for applications built on gevent
        'TornadoScheduler'    Tornado scheduler: for Tornado applications
        'TwistedScheduler'    Twisted scheduler: for Twisted applications
        'QtScheduler'         Qt scheduler: for Qt applications
    '''
    import logging
    logging.basicConfig()
    scheduler = str(scheduler).lower()
    if 'blocking' in scheduler:
        from apscheduler.schedulers.blocking import BlockingScheduler
        self.scheduler = BlockingScheduler()
    elif 'background' in scheduler:
        from apscheduler.schedulers.background import BackgroundScheduler
        self.scheduler = BackgroundScheduler()
    elif 'asyncio' in scheduler:
        from apscheduler.schedulers.asyncio import AsyncIOScheduler
        self.scheduler = AsyncIOScheduler()
    elif 'gevent' in scheduler:
        from apscheduler.schedulers.gevent import GeventScheduler
        self.scheduler = GeventScheduler()
    elif 'tornado' in scheduler:
        from apscheduler.schedulers.tornado import TornadoScheduler
        self.scheduler = TornadoScheduler()
    elif 'twisted' in scheduler:
        from apscheduler.schedulers.twisted import TwistedScheduler
        self.scheduler = TwistedScheduler()
    elif 'qt' in scheduler:
        from apscheduler.schedulers.qt import QtScheduler
        self.scheduler = QtScheduler()
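A small usage sketch for the selector above, assuming the enclosing class simply exposes the chosen scheduler as self.scheduler. The class name TaskScheduler is a hypothetical stand-in, since the snippet does not show it:

def tick():
    print('tick')


ts = TaskScheduler('twisted')  # hypothetical name for the class containing the __init__ above
ts.scheduler.add_job(tick, 'interval', seconds=3)
ts.scheduler.start()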
def run(self, args, opts):
    settings = get_project_settings()
    crawler_process = CrawlerProcess(settings)
    scheduler = TwistedScheduler()
    for spider_name in crawler_process.spider_loader.list():
        if spider_name in self.excludes:
            continue
        spider_cls = crawler_process.spider_loader.load(spider_name)
        scheduler.add_job(crawler_process.crawl, 'interval', args=[spider_cls], seconds=86400)
    scheduler.start()
    crawler_process.start(False)
def schedule():
    export_scheduler = BackgroundScheduler()  # background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler

    process = CrawlerProcess(get_project_settings())  # crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects every spider in the project
    crawler_scheduler = TwistedScheduler()  # Twisted scheduler, since Scrapy itself is built on Twisted
    for spidername in sloader.list():  # schedule a crawl job for each spider the loader found
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # start a crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process (reactor) running
def schedule(self):
    scheduler = TwistedScheduler(
        {'apscheduler.timezone': self.settings.get('APP_TIMEZONE')})
    # TODO: use random interval
    switch = {
        'debug': lambda: scheduler.add_job(self.run_crawler, 'interval', seconds=3),
        'hourly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=3600),
        'daily': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400),
        'weekly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400 * 7),
        'monthly': lambda: scheduler.add_job(
            self.run_crawler, 'interval', seconds=86400 * 30),
    }
    switch[self.settings.get('APP_CRAWL_INTERVAL')]()
    scheduler.start()
class Scheduler:
    def __init__(self):
        self.scrapers = [
            HistorySpider,
            WpbccSpider,
            LWVChicago,
            LibraryEvents,
            GreatLakesReader
        ]
        self.interval_seconds = 60 * config.schedule_interval
        self.scheduler = TwistedScheduler()
        self.scheduler.add_listener(self.schedule_missed, EVENT_JOB_MISSED)

    def add_schedule(self, scraper, seconds_delay):
        self.scheduler.add_job(self.run_scraper,
                               id=scraper.__name__,
                               trigger='interval',
                               args=[scraper],
                               start_date=datetime.now() + relativedelta(seconds=seconds_delay),
                               seconds=self.interval_seconds)

    def schedule_missed(self, event):
        print(f'{event.job_id} missed. Interval time: {self.interval_seconds}')

    def run_scraper(self, scraper):
        start_date = datetime.now().strftime('%m-%d-%Y')
        end_date = (datetime.now() + relativedelta(months=+1)).strftime('%m-%d-%Y')
        print(f'{datetime.now()} starting {scraper.__name__}')
        runner = CrawlerRunner(get_project_settings())
        runner.crawl(scraper, start_date, end_date)
        runner.join()

    def run_schedule(self):
        configure_logging()
        start_interval = self.interval_seconds / len(self.scrapers)
        now = datetime.now()
        self.last_scheduled = now
        for index, scraper in enumerate(self.scrapers):
            self.add_schedule(scraper, start_interval * index)
        self.scheduler.start()
        reactor.run()
    'ITEM_PIPELINES': {
        'pipelines.FilterPipeline': 300,
        'pipelines.SaveReviewPipeline': 400
    },
    'DOWNLOADER_MIDDLEWARES': {
        'scrapy_splash.SplashCookiesMiddleware': 723,
        'scrapy_splash.SplashMiddleware': 725,
        'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
    },
    'POSTGRES_HOST': 'localhost',
    'POSTGRES_PORT': '25432',
    'POSTGRES_DB': 'mob',
    'POSTGRES_USER': '******',
    'POSTGRES_PASSWORD': '******'
})

if len(sys.argv) == 1:
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, 'interval',
                      args=[ReviewSpider, lambda: start_objs()], seconds=45)
    scheduler.start()
    process.start(False)
else:
    process.crawl(ReviewSpider, lambda: start_objs())
    process.start()
from config.settings import path_apscheduler
from config.settings import processpool_executor
from config.settings import threadpool_executor
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}
job_defaults = {'coalesce': coalesce, 'max_instances': max_instances}
scheduler = TwistedScheduler(timezone=local_tz)
# scheduler.add_jobstore(
#     'mongodb',
#     host=MONGO_HOST_SELF,
#     port=MONGO_PORT_SELF,
#     collection=CORE_ID
# )
scheduler.add_executor(ThreadPoolExecutor(threadpool_executor), 'default')
scheduler.add_executor(ProcessPoolExecutor(processpool_executor), 'processpool')
scheduler.start()
# "lianjia-cj-hz", # "lianjia-cj-nj", # "lianjia-cj-cs", # "lianjia-cj-wh", # "lianjia-cj-tj", # "lianjia-cj-zz", #"lianjia-cj-xa", #"lianjia-cj-cd", #"lianjia-cj-su", # "lianjia-cj-cq", # "lianjia-cj-xm", # "lianjia-cj-hf", ]) process = CrawlerProcess(get_project_settings()) sloader = SpiderLoader(get_project_settings()) scheduler = TwistedScheduler() hour = 3 for spidername in sloader.list(): # scheduler.add_job(task, 'cron', minute="*/20") if spidername in allow2: #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour)) # scheduler.add_job(func=aps_test, args=('定时任务',), trigger='cron', second='*/5') # scheduler.add_job(func=aps_test, args=('一次性任务',), # next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12)) # scheduler.add_job(func=aps_test, args=('循环任务',), trigger='interval', seconds=3) print(spidername) scheduler.add_job(process.crawl, 'cron', args=[spidername], next_run_time=datetime.datetime.now() +
            player_5 = player_5.first()
        else:
            check = False

        # if all players exist, then create the team
        if check:
            team_model = Team.objects.create(name=team_dict["Name"],
                                             start_date=team_dict["Start_Date"],
                                             end_date=team_dict["End_Date"],
                                             Player_1=player_1,
                                             Player_2=player_2,
                                             Player_3=player_3,
                                             Player_4=player_4,
                                             Player_5=player_5,
                                             winning_percentage=team_dict["Winning_Percentage"])
            team_model.save()

    def get_dates_from_response(self, response):
        """Return the start date and end date from a response URL as datetime objects."""
        params_string = response.url.split("?")[-1]
        params_string = params_string.split("&")
        start_date = params_string[0].split("=")[-1]
        start_date = datetime.strptime(start_date, "%Y-%m-%d")
        end_date = params_string[1].split("=")[-1]
        end_date = datetime.strptime(end_date, "%Y-%m-%d")
        return start_date, end_date

if __name__ == "__main__":
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    scheduler.add_job(process.crawl, args=[StatsSpider])
    scheduler.add_job(process.crawl, 'interval', args=[StatsSpider], seconds=60 * 60 * 6)
    scheduler.start()
    process.start(False)
    parser = argparse.ArgumentParser(description="Stock quote crawler")
    parser.add_argument("--cron", type=str, const=False, nargs='?',
                        help="Whether to run on the configured schedule; defaults to False")
    args = parser.parse_args()
    cron = str2bool(args.cron, False)

    process = CrawlerProcess(get_project_settings())
    # Set the log level
    logging.getLogger('scrapy.core.scraper').setLevel(logging.WARNING)
    if not cron:
        sequence_run()
        process.start()
    else:
        scheduler = TwistedScheduler()
        scheduler.add_job(sequence_run, 'cron', day_of_week='mon-fri', hour='9-15', minute='0/30')
        scheduler.start()
        process.start(False)
""" Demonstrates how to use the Twisted compatible scheduler to schedule a job that executes on 3 second intervals. """ from datetime import datetime import os from pytz import utc from twisted.internet import reactor from apscheduler.schedulers.twisted import TwistedScheduler def tick(): print('Tick! The time is: %s' % datetime.now()) if __name__ == '__main__': scheduler = TwistedScheduler(timezone=utc) scheduler.add_job(tick, 'interval', seconds=3) scheduler.start() print('Press Ctrl+{0} to exit'.format('Break' if os.name == 'nt' else 'C')) # Execution will block here until Ctrl+C (Ctrl+Break on Windows) is pressed. try: reactor.run() except (KeyboardInterrupt, SystemExit): pass
def start(verbose, debug, proxy, min, product, brand, serie, check, delay, news, days):
    def check_db():
        from DuTracker.tsdb import influxdb
        try:
            influxdb.ping()
        except Exception as e:
            log.error('InfluxDB connection error')
            sys.exit(1)
        else:
            log.success('InfluxDB connection OK')

    if check:
        check_db()

    # https://stackoverflow.com/questions/44228851/scrapy-on-a-schedule
    settings = get_project_settings()
    if verbose:
        log.setLevel(logging.DEBUG)
    if proxy:
        settings['DOWNLOADER_MIDDLEWARES'].update(
            {'DuTracker.middlewares.RandomProxy': 760})
        settings['PROXY_URL'] = proxy
    if debug:
        settings['LOG_ENABLED'] = True
    if delay:
        settings['DOWNLOAD_DELAY'] = delay

    process = CrawlerProcess(settings)
    sched = TwistedScheduler()

    if brand:
        sched.add_job(process.crawl, 'interval', args=[BrandSpider],
                      kwargs={'auto': True, 'Ids': brand}, days=1)
        process.crawl(BrandSpider, auto=True, Ids=brand)
    if serie:
        sched.add_job(process.crawl, 'interval', args=[SerieSpider],
                      kwargs={'auto': True, 'Ids': serie}, days=1)
        process.crawl(SerieSpider, auto=True, Ids=serie)
    if brand or serie:
        sched.add_job(process.crawl, 'interval', args=[ProductSpider],
                      kwargs={'fromDB': True}, days=1)
        process.crawl(ProductSpider, fromDB=True)

    process.crawl(TrackerSpider, soldNum_min=min, Ids=product)
    sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                  kwargs={'soldNum_min': min, 'Ids': product}, hours=6)
    if news:
        sched.add_job(process.crawl, 'interval', args=[TrackerSpider],
                      kwargs={'newItem': True, 'days': days}, hours=1)

    sched.add_job(sched.print_jobs, 'interval', hours=6)
    log.info('Starting product price tracking')
    sched.start()
    process.start(False)
from scrapy.crawler import CrawlerProcess
from spiders.zhihu import ZhihuSpider
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler

process = CrawlerProcess(get_project_settings())
sched = TwistedScheduler()
sched.add_job(process.crawl, 'interval', args=[ZhihuSpider], seconds=300)
sched.start()
process.start(False)  # Do not stop reactor after spider closes
#
# @defer.inlineCallbacks
# def crawl():
#     yield runner.crawl(AnjukeSpider)
#     yield runner.crawl(BeikeSpider)
#     yield runner.crawl(LianjiaSpider)
#     reactor.stop()
#
# crawl()
# reactor.run()  # the script will block here until the last crawl call is finished

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler
from scrapy_jojozu.spiders.anjuke import AnjukeSpider
from scrapy_jojozu.spiders.beike import BeikeSpider
from scrapy_jojozu.spiders.lianjia import LianjiaSpider
from scrapy_jojozu.spiders.fangtianxia import FangSpider
from scrapy_jojozu.spiders.douban import DoubanSpider

process = CrawlerProcess(get_project_settings())
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl, 'interval', args=[AnjukeSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[BeikeSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[FangSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[LianjiaSpider], minutes=30)
scheduler.add_job(process.crawl, 'interval', args=[DoubanSpider], minutes=30)
scheduler.start()
process.start(False)
import json

from klein import Klein, route, run
from scrapy import signals
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from apscheduler.schedulers.twisted import TwistedScheduler
import datetime

from meituan_spider.spiders.meituan_article import MeituanArticleSpider

app = Klein()
scheduler = TwistedScheduler()
# crawl_job = None


class MyCrawlerRunner(CrawlerRunner):
    """
    Crawler object that collects items and returns output after finishing crawl.
    """
    def crawl(self, crawler_or_spidercls, *args, **kwargs):
        # keep all items scraped
        self.items = []
        # create crawler (same as in base CrawlerProcess)
        crawler = self.create_crawler(crawler_or_spidercls)
        # handle each item scraped
        crawler.signals.connect(self.item_scraped, signals.item_scraped)
        # create Twisted.Deferred launching crawl
def run_scheduler(flask_app):
    scheduler = TwistedScheduler()
    JobsAdder(scheduler, flask_app).add_jobs()
    scheduler.start()
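JobsAdder is not shown in the snippet. A minimal sketch of what such a helper might look like, assuming it only registers periodic jobs on the scheduler it is given; the class shape and the example job are assumptions, not the original code:

class JobsAdder:
    # Hypothetical reconstruction; the real JobsAdder lives elsewhere in the project.
    def __init__(self, scheduler, flask_app):
        self.scheduler = scheduler
        self.flask_app = flask_app

    def add_jobs(self):
        self.scheduler.add_job(self.ping, 'interval', minutes=5)

    def ping(self):
        # Example of periodic work that needs the Flask application context.
        with self.flask_app.app_context():
            print('periodic job running inside the app context')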
from apscheduler.schedulers.twisted import TwistedScheduler
from twisted.internet import reactor
from twisted.internet.defer import inlineCallbacks
from autobahn.twisted.wamp import ApplicationSession

scheduler = TwistedScheduler()
scheduler.add_job(tick, 'interval', seconds=3)
scheduler.start()
class Spiders(resource.Resource):
    u"""Show each spider's working status and its schedule."""

    config = Config()

    def __init__(self, root, local_items):
        resource.Resource.__init__(self)
        self.root = root
        self.local_items = local_items
        self.spider_status_dic = {}
        logging.basicConfig()
        self.scheduler = TwistedScheduler()
        self.scheduler.start()

    def get_spider_status(self, project):
        spider_status = self.spider_status_dic.get(project)
        if not spider_status:
            spider_status = dict((spider, {'status': 'finished',
                                           'timestamp': None,
                                           'job': None,
                                           'schedule_job': None})
                                 for spider in get_spider_list(project))
            self.spider_status_dic[project] = spider_status
        self._update_spider_status(project)
        return spider_status

    def _update_spider_status(self, project):
        u"""Collect the current scheduling state first, then the jobs held by APScheduler."""
        spider_status = self.spider_status_dic.get(project)
        for project, queue in self.root.poller.queues.items():
            for m in queue.list():
                spider = m['name']
                job = m['_job']
                spider_status[spider]['status'] = 'pending'
                spider_status[spider]['timestamp'] = None
                spider_status[spider]['job'] = job
        for p in self.root.launcher.processes.values():
            spider = p.spider
            spider_status[spider]['status'] = 'running'
            spider_status[spider]['timestamp'] = p.start_time
            spider_status[spider]['job'] = p.job
        for p in self.root.launcher.finished:
            spider = p.spider
            spider_status[spider]['status'] = 'finished'
            spider_status[spider]['timestamp'] = p.end_time
            spider_status[spider]['job'] = p.job
        for spider in spider_status:
            status = spider_status[spider]
            sjob = self.scheduler.get_job(spider)
            status['schedule_job'] = sjob
            if sjob:
                status['next_time'] = sjob.next_run_time
            else:
                status['next_time'] = None
            # sjob._get_run_times()

    def render_GET(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        spider_status = self.get_spider_status(project)
        content = "<tr>"
        for th in ['spider', 'status', 'timestamp', 'next_time', 'data']:
            content += "<th>%s</th>" % th
        content += "</tr>"
        for spider in spider_status:
            status = spider_status[spider]
            content += "<tr>"
            content += "<td>%s</td><td>%s</td><td>%s</td><td>%s</td>" \
                % (spider, status['status'], status['timestamp'], status['next_time'])
            content += "<td><a href='/data/%s/'>data</a></td>" % spider
            content += "</tr>"
        sub_form = "<form action='' method='post'><input type='submit' value='Start all jobs'></input></form>"
        html = "<table>" + content + "</table>" + sub_form
        return html

    def render_POST(self, txrequest):
        args = dict((k, v[0]) for k, v in txrequest.args.items())
        project = args.pop('project', 'careerTalk')
        # spiders = ['NJU', 'BIT', 'ECUST', 'RUC']
        spiders = get_spider_list(project)
        tstart = dt.datetime.utcnow()
        for spider in spiders:
            job = self.scheduler.add_job(spider_crawl, 'interval', minutes=60,
                                         replace_existing=True, id=spider,
                                         next_run_time=tstart,
                                         args=[project, spider])
            tstart = tstart + dt.timedelta(seconds=5)
        return "<span>All jobs started</span><a href='/'>Back</a>"
configure_logging()
session = DBSession()
config = session.query(Config).filter(Config.id == 1).one()
runner = CrawlerRunner(get_scrapy_settings(config))


@defer.inlineCallbacks
def crawl(rule):
    dot = rule.spider_clsass.rindex('.')
    module, name = rule.spider_clsass[:dot], rule.spider_clsass[dot + 1:]
    if module == GeneralSpider.__module__ and name == GeneralSpider.__name__:
        yield runner.crawl(GeneralSpider, rule)


sched = TwistedScheduler()
# sched.add_jobstore('sqlalchemy', url='mysql+pymysql://root:djejeUJ3qj^[email protected]:3306/apscheduler')
spiderRules = session.query(SpiderRule).filter(SpiderRule.enable, SpiderRule.cron).all()
for sr in spiderRules:
    if sr.cron:
        sched.add_job(crawl, CronTrigger.from_crontab(sr.cron), args=[sr], name=sr.name, id='%s' % sr.id)
    # else:
    #     sched.add_job(crawl, 'date', args=[sr], name=sr.name, id='%s' % sr.id)
session.close()
from config.settings import path_apscheduler
from config.settings import processpool_executor
from config.settings import threadpool_executor
from core import toLog
from core.log_manager.log_levels import setup_logger

setup_logger('apscheduler', path_apscheduler, logging.DEBUG)

executors = {
    'default': ThreadPoolExecutor(threadpool_executor),
    'processpool': ProcessPoolExecutor(processpool_executor)
}
job_defaults = {'coalesce': coalesce, 'max_instances': max_instances}
scheduler = TwistedScheduler(timezone=local_tz)
scheduler.add_jobstore('mongodb',
                       host=MONGO_HOST_SELF,
                       port=MONGO_PORT_SELF,
                       collection=CORE_ID)
scheduler.add_executor(ThreadPoolExecutor(threadpool_executor), 'default')
scheduler.add_executor(ProcessPoolExecutor(processpool_executor), 'processpool')
scheduler.start()


def job_logger(event):
    if event.code > 512:
        title = response.xpath("//article[@class='detail']/div[@class='detail__header']/h1/text()").get()
        paragraphs = response.xpath("//div[@class='detail__body itp_bodycontent_wrapper']/div[@class='detail__body-text itp_bodycontent']/p/text()").extract()
        if paragraphs is None:
            return
        content = ""
        for paragraph in paragraphs:
            content = content + paragraph

        database = connection["scrapingdb"]
        collection = database["detik_articles"]
        collection.insert_one({
            "title": title,
            "content": content,
            "url": response.url
        })
        connection.close()
        yield {
            "message": f"success: {response.url}"
        }


process = CrawlerProcess()
# process.crawl(MySpider)
# process.start()
scheduler = TwistedScheduler()
scheduler.add_job(process.crawl, 'interval', args=[MySpider], seconds=5)
scheduler.start()
process.start(False)
# "lianjia-cj-cs", # "lianjia-cj-wh", # "lianjia-cj-tj", # "lianjia-cj-zz", #"lianjia-cj-xa", #"lianjia-cj-cd", #"lianjia-cj-su", # "lianjia-cj-cq", # "lianjia-cj-xm", # "lianjia-cj-hf", ]) process = CrawlerProcess(get_project_settings()) runner = CrawlerRunner(get_project_settings()) # runner.crawl() sloader = SpiderLoader(get_project_settings()) scheduler = TwistedScheduler() hour = 1 for spidername in sloader.list(): # scheduler.add_job(task, 'cron', minute="*/20") if spidername in allow: #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour)) # scheduler.add_job(func=aps_test, args=('定时任务',), trigger='cron', second='*/5') # scheduler.add_job(func=aps_test, args=('一次性任务',), # next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12)) # scheduler.add_job(func=aps_test, args=('循环任务',), trigger='interval', seconds=3) print(spidername) scheduler.add_job(process.crawl, trigger='date', args=[spidername], run_date=datetime.datetime(2018, 9, 10, hour, 0, 0)) # scheduler.add_job(process.crawl, trigger='cron', args=[spidername], # year='*', month='*', day=9, week='*', day_of_week='*', hour=hour, minute=20, second=0)
from camping.spiders.choansan_spider import ChoansanSpider
from camping.spiders.gangdong_spider import GangdongSpider
from camping.spiders.joongrangsoop_spider import JoongrangsoopSpiderSpider
from camping.spiders.imjingak_spider import ImjingakSpider
from camping.spiders.pyeongtaek_spider import PyeongtaekSpider
from camping.spiders.campunak_spider import CampunakSpider
from datetime import date
from datetime import timedelta
import calendar

# if __name__ == '__main__':
try:
    process = CrawlerProcess(get_project_settings())
    scheduler = TwistedScheduler()
    # scheduler.add_job(process.crawl, 'interval', args=[ChoansanSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[GangdongSpider], seconds=10)
    scheduler.add_job(process.crawl, 'interval', args=[JoongrangsoopSpiderSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[ImjingakSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[PyeongtaekSpider], seconds=15)
    # scheduler.add_job(process.crawl, 'interval', args=[CampunakSpider], seconds=15)
    scheduler.start()
    process.start(False)
except (KeyboardInterrupt, SystemExit):
    print("stop process")
import logging
logging.basicConfig(format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                    datefmt='%a, %d %b %Y %H:%M:%S', filename='list.log', filemode='w')

n = 6000
ll = [0 for i in range(n)]


def func(i):
    ll[i] += 1


def err_lis(ev):
    logger = logging.getLogger("")
    logger.error(str(ev))


# scheduler = BackgroundScheduler()
scheduler = TwistedScheduler()
for i in range(n):
    start = datetime.datetime.now() + datetime.timedelta(seconds=i % 10)
    scheduler.add_job(func, 'interval', args=(i,), start_date=start, seconds=10)
scheduler.add_listener(err_lis, apscheduler.events.EVENT_JOB_ERROR | apscheduler.events.EVENT_JOB_MISSED)
scheduler.start()
time.sleep(5)
scheduler.shutdown()

s = 0
for i in ll:
    s += i
print(s)