Example #1
def run_spider(settings, itemcount, keyheader='', conid='', spider_id=0):
    s = Settings()
    s.setmodule(settings)
    sl = SpiderLoader(settings=s)
    print('spider list=', sl.list())
    spider = sl.load(sl.list()[spider_id])
    spider.itemcount = itemcount
    configure_logging({'LOG_LEVEL': 'DEBUG'})  # set the Scrapy log level
    runner = CrawlerRunner(settings=s)

    crawler = runner.create_crawler(spider)
    #if sighandler != None:
    #sighandler.connect(crawler)
    d = runner.crawl(crawler, keyheader=keyheader, conid=conid)
    #d = runner.crawl(spider, keyheader=keyheader, itemcount=itemcount)
    return d
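
A possible way to drive this helper, sketched under assumptions: CrawlerRunner does not start the Twisted reactor itself, so the Deferred returned by run_spider has to be awaited by a reactor owned by the caller. The module path 'myproject.settings' and the itemcount value are placeholders.

from twisted.internet import reactor

d = run_spider('myproject.settings', itemcount=100)
d.addBoth(lambda _: reactor.stop())  # stop the reactor when the crawl finishes or fails
reactor.run()                        # blocks until the crawl is done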
Example #2
def main(egg_path=None):
    settings_module = activate_egg(egg_path)

    settings = Settings()
    settings.setmodule(settings_module)
    loader = SpiderLoader(settings)
    for spider in loader.list():
        print(spider)
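
For comparison, a minimal sketch that lists spiders without an egg, assuming an importable Scrapy settings module (the name 'myproject.settings' is a placeholder):

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

settings = Settings()
settings.setmodule('myproject.settings')  # placeholder settings module path
for name in SpiderLoader(settings).list():
    print(name)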
Example #3
def v2crawler():
    settings = get_project_settings()
    spider_loader = SpiderLoader(settings)
    spider_names = spider_loader.list()

    crawler_process = CrawlerProcess(settings)
    for spider_name in spider_names:
        crawler_process.crawl(spider_name)

    crawler_process.start()
Example #4
def run_spider():
    s = Settings()
    s.setmodule(ulsan_settings)
    #process = CrawlerProcess(get_project_settings())
    sl = SpiderLoader(settings=s)
    print('#### spider list=', sl.list())
    spider = sl.load(sl.list()[0])
    #process = CrawlerProcess(settings=s)
    #d = process.crawl(spider)
    #process.crawl(UillOrKr)
    #process.start(stop_after_crawl=False)
    #process.start()
    #configure_logging({'LOG_FORMAT': '## %(levelname)s: %(message)s'})
    #configure_logging({'LOG_LEVEL': 'DEBUG'})
    runner = CrawlerRunner(settings=s)
    print(f'#### settings.LOG_ENABLED = {s["LOG_ENABLED"]}')
    d = runner.crawl(spider)
    #d.addBoth(lambda _: reactor.stop())
    #reactor.run()
    return d
Example #5
def schedule():
    export_scheduler = BackgroundScheduler()  # create a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the scheduler

    process = CrawlerProcess(get_project_settings())  # crawler process for the project
    sloader = SpiderLoader(get_project_settings())  # spider loader holding every spider in the project
    crawler_scheduler = TwistedScheduler()  # Twisted-based scheduler, since Scrapy itself runs on Twisted
    for spidername in sloader.list():  # schedule each spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # queue the crawl every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # stop_after_crawl=False: keep the process running
Example #6
def main():
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE',
                          'products_crawler.settings')
    scrapy_settings = get_project_settings()
    spider_loader = SpiderLoader(scrapy_settings)

    parse = ArgumentParser()
    parse.add_argument("brand", choices=spider_loader.list())
    parse.add_argument("start_urls", nargs="*")
    args = parse.parse_args()

    process = CrawlerProcess(scrapy_settings)
    process.crawl(args.brand, **{'start_urls': args.start_urls})
    process.start()
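
Usage note: given the argparse setup above, this script would be invoked as something like "python main.py <brand> [start_urls ...]", where <brand> must be one of the spider names returned by spider_loader.list() and any extra arguments are passed to the spider as start_urls.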
Example #7
 def handle(self, *args, **options):
     settings = Settings({
         "SPIDER_MODULES": ["scraping.spiders", "scraping.spiders.ics"],
     })
     spider_loader = SpiderLoader(settings)
     # Run all spiders if none specified
     spiders = options["spider"] or spider_loader.list()
     configure_logging()
     runner = CrawlerRunner(settings=settings)
     for spider_name in spiders:
         runner.crawl(spider_loader.load(spider_name))
     deferred = runner.join()
     deferred.addBoth(lambda _: reactor.stop())
     reactor.run()
Example #8
def start_spider(q):
    logging.info('------------ start spider ---------------')

    # create a CrawlerProcess instance from the project settings
    process = CrawlerProcess(get_project_settings())

    # get the SpiderLoader to enumerate all spider names in the project
    spider_loader = SpiderLoader(get_project_settings())

    # queue every spider for crawling
    for spidername in spider_loader.list():
        process.crawl(spidername)
    q.put(None)
    process.start()
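
A possible caller for this function, sketched under the assumption that start_spider is meant to run in its own process and that q is a multiprocessing.Queue used as a simple readiness signal (start_spider puts None on it just before starting the crawl):

from multiprocessing import Process, Queue

q = Queue()
p = Process(target=start_spider, args=(q,))
p.start()
q.get()   # returns once the spiders have been queued
p.join()  # wait for the crawling process to finish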
Example #9
def crawl(spiders):
    """Crawl the desired spiders.

    Type the name or part of the name of the spider.
    Multiple spiders can be provided.
    If none is given, all spiders will be crawled.
    """
    settings = get_project_settings()
    loader = SpiderLoader(settings)

    process = CrawlerProcess(settings)
    for spider_name in loader.list():
        if not spiders or any(part in spider_name for part in spiders):
            process.crawl(spider_name)

    # The script will block here until the crawling is finished
    process.start()
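
A hedged usage sketch: calling crawl with an empty list runs every spider in the project, while passing name fragments restricts the run (the fragment "news" below is a placeholder).

crawl(["news"])  # only spiders whose name contains the given fragment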
Example #10
     # "lianjia-cj-cs",
     # "lianjia-cj-wh",
     # "lianjia-cj-tj",
     # "lianjia-cj-zz",
     #"lianjia-cj-xa",
     #"lianjia-cj-cd",
     #"lianjia-cj-su",
     #  "lianjia-cj-cq",
     # "lianjia-cj-xm",
     # "lianjia-cj-hf",
 ])
 process = CrawlerProcess(get_project_settings())
 sloader = SpiderLoader(get_project_settings())
 scheduler = TwistedScheduler()
 hour = 3
 for spidername in sloader.list():
     # scheduler.add_job(task, 'cron', minute="*/20")
     if spidername in allow2:
         #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
         # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
         # scheduler.add_job(func=aps_test, args=('定时任务',), trigger='cron', second='*/5')
         # scheduler.add_job(func=aps_test, args=('一次性任务',),
         #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
         # scheduler.add_job(func=aps_test, args=('循环任务',), trigger='interval', seconds=3)
         print(spidername)
         scheduler.add_job(process.crawl,
                           'cron',
                           args=[spidername],
                           next_run_time=datetime.datetime.now() +
                           datetime.timedelta(hours=3))
         hour += 2
Example #11
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    spider_loader = SpiderLoader(get_project_settings())
    spiders = spider_loader.list()

    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider_loader.load(spider))
    process.start()