Example #1
def crawl(settings={}, spider_name="spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"  #spider output output in json format

    # Derive a key from the hostname of the first start URL, preferring the
    # kwargs over the spider class; log if neither provides start_urls.
    try:
        start_urls = spider_kwargs.get("start_urls") or spider_cls.start_urls
        spider_key = urlparse(start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(  #output json data to feed folder in working directory
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    # The feed settings override the project defaults from settings.py (merged below).
    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()  # start the crawl; blocks until the spider finishes
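
The AWS branch above relies on a project helper is_in_aws() that is not shown in these examples. A minimal sketch, assuming it simply checks for the environment variables the AWS Lambda runtime defines:

import os

def is_in_aws():
    # Assumption: detect Lambda via its runtime environment variables
    # (AWS_LAMBDA_FUNCTION_NAME / AWS_EXECUTION_ENV); locally these are unset.
    return bool(os.getenv("AWS_LAMBDA_FUNCTION_NAME") or os.getenv("AWS_EXECUTION_ENV"))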
Example #2
 def __init__(self, settings):
     self.__validate_settings(settings)
     self.settings = settings
     self.spiders = settings.get('SPIDERS')
     self.register = MongoUpdatesRegister(settings)
     self.register.open_db()
     self.spider_loader = SpiderLoader(settings)
Example #3
def crawl(settings={}, spider_name="adac", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"

    try:
        spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
            "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
Example #4
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"
    
    try:
        spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
            "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if (is_in_aws() and os.getenv("USE_S3_CACHE") !=  "0") or os.getenv("USE_S3_CACHE"):
                settings["HTTPCACHE_STORAGE"] =  "my_sls_scraper.extensions.s3cache.S3CacheStorage"
                settings["S3CACHE_URI"] =  f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache"


    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
Example #5
def main(egg_path=None):
    settings_module = activate_egg(egg_path)

    settings = Settings()
    settings.setmodule(settings_module)
    loader = SpiderLoader(settings)
    for spider in loader.list():
        print(spider)
Example #6
def v2crawler():
    settings = get_project_settings()
    spider_loader = SpiderLoader(settings)
    spider_names = spider_loader.list()

    crawler_process = CrawlerProcess(settings)
    for spider_name in spider_names:
        crawler_process.crawl(spider_name)

    crawler_process.start()
Example #7
def main(argv):
    module_name = argv[1]
    spider_name = argv[2]
    env = ''
    if len(sys.argv) > 3:
        env = argv[3]
    settings = load_conf(module_name, env)
    spider_loader = SpiderLoader(settings)
    crawler = CrawlerProcess(settings)
    spider = spider_loader.load(spider_name)
    crawler.crawl(spider)
    crawler.start()
Example #8
def schedule():
    export_scheduler = BackgroundScheduler()  # create a background scheduler
    export_scheduler.add_job(flush_news, 'interval', minutes=60)  # run flush_news every 60 minutes
    export_scheduler.start()  # start the background scheduler

    process = CrawlerProcess(get_project_settings())  # create the crawler process
    sloader = SpiderLoader(get_project_settings())  # spider loader: collects every spider in the project
    crawler_scheduler = TwistedScheduler()  # Twisted-based scheduler, since Scrapy itself runs on Twisted
    for spidername in sloader.list():  # schedule a crawl job for each spider found by the loader
        crawler_scheduler.add_job(process.crawl, 'interval', args=[spidername], minutes=30)  # run each spider every 30 minutes
    crawler_scheduler.start()  # start the crawler scheduler
    process.start(False)  # keep the process running (stop_after_crawl=False)
Example #9
    def get_crawler_class(self, crawler):
        """
        Searches through the modules in self.__crawer_module for a crawler with
        the name passed along.

        :param str crawler: Name of the crawler to load
        :rtype: crawler-class
        """
        settings = Settings()
        settings.set('SPIDER_MODULES', [self.__crawer_module])
        spider_loader = SpiderLoader(settings)
        return spider_loader.load(crawler)
Example #10
def main():
    os.environ.setdefault('SCRAPY_SETTINGS_MODULE',
                          'products_crawler.settings')
    scrapy_settings = get_project_settings()
    spider_loader = SpiderLoader(scrapy_settings)

    parse = ArgumentParser()
    parse.add_argument("brand", choices=spider_loader.list())
    parse.add_argument("start_urls", nargs="*")
    args = parse.parse_args()

    process = CrawlerProcess(scrapy_settings)
    process.crawl(args.brand, **{'start_urls': args.start_urls})
    process.start()
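
A hypothetical invocation of Example #10 (the script name, the "acme" brand, and the URL are illustrative, not taken from the project):

    python crawl_products.py acme https://www.acme.example/catalog

This runs the "acme" spider with start_urls=["https://www.acme.example/catalog"] passed as a spider argument.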
Example #11
 def handle(self, *args, **options):
     settings = Settings({
         "SPIDER_MODULES": ["scraping.spiders", "scraping.spiders.ics"],
     })
     spider_loader = SpiderLoader(settings)
     # Run all spiders if none specified
     spiders = options["spider"] or spider_loader.list()
     configure_logging()
     runner = CrawlerRunner(settings=settings)
     for spider_name in spiders:
         runner.crawl(spider_loader.load(spider_name))
     deferred = runner.join()
     deferred.addBoth(lambda _: reactor.stop())
     reactor.run()
Example #12
def start_spider(q):
    logging.info('------------ start spider ---------------')

    # Get a CrawlerProcess instance from the project settings
    process = CrawlerProcess(get_project_settings())

    # Get the SpiderLoader object so we can list every spider name in the project
    spider_loader = SpiderLoader(get_project_settings())

    # Queue every spider that should be crawled
    for spidername in spider_loader.list():
        process.crawl(spidername)
    q.put(None)
    process.start()
Example #13
 def run_spider_from_queue(self):
     if self.q.empty():
         self.remaining_spiders -= 1
         if self.remaining_spiders == 0:
             logger.debug("Stop reactor")
             reactor.stop()
         return
     blog = self.q.get()
     loader = SpiderLoader(get_project_settings())
     spidercls = loader.load(self.blogs[blog]['spider'])
     crawler = Crawler(spidercls, get_project_settings())
     crawler.signals.connect(self.spider_closed,
                             signal=signals.spider_closed)
     self.runner.crawl(crawler, **self.blogs[blog], blog_name=blog)
Example #14
def run_spider(settings, itemcount, keyheader='', conid='', spider_id=0):
    s = Settings()
    s.setmodule(settings)
    sl = SpiderLoader(settings=s)
    print('spider list=', sl.list())
    spider = sl.load(sl.list()[spider_id])
    spider.itemcount = itemcount
    configure_logging({'LOG_LEVEL': 'DEBUG'})  # set the Scrapy log level
    runner = CrawlerRunner(settings=s)

    crawler = runner.create_crawler(spider)
    #if sighandler != None:
    #sighandler.connect(crawler)
    d = runner.crawl(crawler, keyheader=keyheader, conid=conid)
    #d = runner.crawl(spider, keyheader=keyheader, itemcount=itemcount)
    return d
Example #15
def crawl(spiders):
    """Crawl the desired spiders.

    Type the name or part of the name of the spider.
    Multiple spiders can be provided.
    If none is given, all spiders will be crawled.
    """
    settings = get_project_settings()
    loader = SpiderLoader(settings)

    process = CrawlerProcess(settings)
    for spider_name in loader.list():
        if not spiders or any(part for part in spiders if part in spider_name):
            process.crawl(spider_name)

    # The script will block here until the crawling is finished
    process.start()
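
Example #15's docstring describes a partial-name match; a hypothetical session, assuming the function is exposed as a CLI command named crawl and that the project contains spiders with these illustrative names:

    crawl news          # runs news_daily, news_archive, ... (any name containing "news")
    crawl news blog     # runs every spider whose name contains "news" or "blog"
    crawl               # no arguments: runs every spider in the project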
Example #16
def crawl(settings={}, spider_name="linksExtract", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)
    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "csv"

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s.csv"

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})
    process.crawl(spider_cls, **spider_kwargs)
    process.start()
    time.sleep(0.5)
Example #17
def crawl(settings={}, spider_name="", key="", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)
    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "csv"
    spider_key = ""
    try:
        spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
            "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception as e:
        logging.exception("Spider or kwargs need start_urls.")
        logging.exception(e)

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/{spider_name}_{key}.csv"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()

    if is_in_aws() and has_task_token():
        import boto3
        import json
        client = boto3.client('stepfunctions')
        client.send_task_success(
            taskToken=os.getenv('TASK_TOKEN_ENV_VARIABLE'),
            output=json.dumps({"feed_uri": feed_uri})
        )
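
Example #17 hands its result back to AWS Step Functions through the task-token callback pattern. The has_task_token() helper is not shown; a minimal sketch, assuming it only checks the same environment variable read by send_task_success() above:

import os

def has_task_token():
    # Assumption: the Step Functions task token is injected via the
    # TASK_TOKEN_ENV_VARIABLE variable referenced in the example above.
    return bool(os.getenv('TASK_TOKEN_ENV_VARIABLE'))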
Example #18
 def run_spider():
     s = Settings()
     s.setmodule(ulsan_settings)
     #process = CrawlerProcess(get_project_settings())
     sl = SpiderLoader(settings=s)
     print('#### spider list=', sl.list())
     spider = sl.load(sl.list()[0])
     #process = CrawlerProcess(settings=s)
     #d = process.crawl(spider)
     #process.crawl(UillOrKr)
     #process.start(stop_after_crawl=False)
     #process.start()
     #configure_logging({'LOG_FORMAT': '## %(levelname)s: %(message)s'})
     #configure_logging({'LOG_LEVEL': 'DEBUG'})
     runner = CrawlerRunner(settings=s)
     print(f'#### settings.LOG_ENABLED = {s["LOG_ENABLED"]}')
     d = runner.crawl(spider)
     #d.addBoth(lambda _: reactor.stop())
     #reactor.run()
     return d
Example #19
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"

    try:
        spider_key = urlparse(
            spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
                "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )
    if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"):
        settings["HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage"
        settings["S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache"

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
Example #20
#!/usr/bin/env python
from scrapy.crawler import CrawlerProcess
from threading import Thread
from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader
import time

settings = Settings()
settings_module_path = "settings"
if settings_module_path:
    settings.setmodule(settings_module_path, priority='project')
process = CrawlerProcess(settings)

def _start_crawler_thread():
    t = Thread(target=process.start,kwargs={'stop_after_crawl': False})
    t.daemon = True
    t.start()


loader = SpiderLoader(settings)

# SpiderLoader keeps a private name-to-class mapping; the public list() method
# returns the spider names, which CrawlerProcess.crawl() also accepts.
for spider_name in loader.list():
    process.crawl(spider_name)

_start_crawler_thread()

while 1:
    time.sleep(2)

Example #21
     # "lianjia-cj-gz",
     # "lianjia-cj-hz",
     # "lianjia-cj-nj",
     # "lianjia-cj-cs",
     # "lianjia-cj-wh",
     # "lianjia-cj-tj",
     # "lianjia-cj-zz",
     #"lianjia-cj-xa",
     #"lianjia-cj-cd",
     #"lianjia-cj-su",
     #  "lianjia-cj-cq",
     # "lianjia-cj-xm",
     # "lianjia-cj-hf",
 ])
 process = CrawlerProcess(get_project_settings())
 sloader = SpiderLoader(get_project_settings())
 scheduler = TwistedScheduler()
 hour = 3
 for spidername in sloader.list():
     # scheduler.add_job(task, 'cron', minute="*/20")
     if spidername in allow2:
         #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html
         # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour))
         # scheduler.add_job(func=aps_test, args=('定时任务',), trigger='cron', second='*/5')
         # scheduler.add_job(func=aps_test, args=('一次性任务',),
         #                   next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12))
         # scheduler.add_job(func=aps_test, args=('循环任务',), trigger='interval', seconds=3)
         print(spidername)
         scheduler.add_job(process.crawl,
                           'cron',
                           args=[spidername],