Example #1
def crawl(settings={}, spider_name="adac", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"

    try:
        spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
            "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
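Several of these examples rely on a project-specific is_in_aws() helper that is never shown, alongside the usual imports (os, logging, urllib.parse.urlparse, scrapy.spiderloader.SpiderLoader, scrapy.crawler.CrawlerProcess, scrapy.utils.project.get_project_settings). A minimal sketch of that helper, assuming it simply checks for an environment variable that AWS Lambda sets:

import os

def is_in_aws():
    # Assumption: AWS Lambda sets AWS_LAMBDA_FUNCTION_NAME; treat its
    # presence as "running inside AWS". Adapt to your own deployment.
    return bool(os.getenv("AWS_LAMBDA_FUNCTION_NAME"))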
Example #2
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"
    
    try:
        spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
            "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"):
        settings["HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage"
        settings["S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache"


    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
Example #3
def crawl(settings={}, spider_name="spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"  # spider output in JSON format

    try:
        spider_key = urlparse(
            spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
                "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception(
            "Spider or kwargs need start_urls."
        )  # logged when neither the spider nor the kwargs provide start_urls

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
    else:
        # Output JSON data to the "feed" folder in the working directory.
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    settings['FEED_URI'] = feed_uri  # settings.py contains the spider's project settings
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()  # start the crawl (blocks until the spider finishes)
Example #4
    def get_crawler_class(self, crawler):
        """
        Searches through the modules in self.__crawer_module for a crawler with
        the name passed along.

        :param str crawler: Name of the crawler to load
        :rtype: crawler-class
        """
        settings = Settings()
        settings.set('SPIDER_MODULES', [self.__crawer_module])
        spider_loader = SpiderLoader(settings)
        return spider_loader.load(crawler)
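This method belongs to a class that keeps the spider package name in the name-mangled attribute self.__crawer_module. A minimal sketch of such a wrapper and how it might be called; the class name, module path, and spider name are illustrative assumptions:

from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader

class CrawlerRegistry:
    def __init__(self, crawler_module):
        # e.g. "myproject.spiders"; attribute name mirrors the example above
        self.__crawer_module = crawler_module

    def get_crawler_class(self, crawler):
        settings = Settings()
        settings.set('SPIDER_MODULES', [self.__crawer_module])
        spider_loader = SpiderLoader(settings)
        return spider_loader.load(crawler)

registry = CrawlerRegistry("myproject.spiders")        # hypothetical module path
spider_cls = registry.get_crawler_class("my_spider")   # hypothetical spider name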
Example #5
def main(argv):
    module_name = argv[1]
    spider_name = argv[2]
    env = ''
    if len(sys.argv) > 3:
        env = argv[3]
    settings = load_conf(module_name, env)
    spider_loader = SpiderLoader(settings)
    crawler = CrawlerProcess(settings)
    spider = spider_loader.load(spider_name)
    crawler.crawl(spider)
    crawler.start()
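The load_conf() helper used here is not shown. A minimal sketch, assuming it simply builds a Scrapy Settings object from a per-module settings file, optionally suffixed with an environment name; the naming convention is an assumption:

from scrapy.settings import Settings

def load_conf(module_name, env=''):
    # Assumption: settings live in e.g. "module_name.settings" or
    # "module_name.settings_prod" when an environment suffix is given.
    settings = Settings()
    settings_module = f"{module_name}.settings" + (f"_{env}" if env else "")
    settings.setmodule(settings_module, priority='project')
    return settings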
Example #7
 def run_spider_from_queue(self):
     if self.q.empty():
         self.remaining_spiders -= 1
         if self.remaining_spiders == 0:
             logger.debug("Stop reactor")
             reactor.stop()
         return
     blog = self.q.get()
     loader = SpiderLoader(get_project_settings())
     spidercls = loader.load(self.blogs[blog]['spider'])
     crawler = Crawler(spidercls, get_project_settings())
     crawler.signals.connect(self.spider_closed,
                             signal=signals.spider_closed)
     self.runner.crawl(crawler, **self.blogs[blog], blog_name=blog)
Example #8
 def handle(self, *args, **options):
     settings = Settings({
         "SPIDER_MODULES": ["scraping.spiders", "scraping.spiders.ics"],
     })
     spider_loader = SpiderLoader(settings)
     # Run all spiders if none specified
     spiders = options["spider"] or spider_loader.list()
     configure_logging()
     runner = CrawlerRunner(settings=settings)
     for spider_name in spiders:
         runner.crawl(spider_loader.load(spider_name))
     deferred = runner.join()
     deferred.addBoth(lambda _: reactor.stop())
     reactor.run()
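This handle() method reads like the body of a Django management command: it pulls spider names out of options["spider"]. A minimal sketch of the enclosing command class, assuming the spider names arrive as an optional positional argument (the argument wiring is an assumption):

from django.core.management.base import BaseCommand

class Command(BaseCommand):
    help = "Run one or more Scrapy spiders"

    def add_arguments(self, parser):
        # Assumption: zero or more spider names; an empty list means "run all".
        parser.add_argument("spider", nargs="*")

    def handle(self, *args, **options):
        # Body as shown in the example above.
        ...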
Example #9
def run_spider(settings, itemcount, keyheader='', conid='', spider_id=0):
    s = Settings()
    s.setmodule(settings)
    sl = SpiderLoader(settings=s)
    print('spider list=', sl.list())
    spider = sl.load(sl.list()[spider_id])
    spider.itemcount = itemcount
    configure_logging({'LOG_LEVEL': 'DEBUG'})  # set the Scrapy log level
    runner = CrawlerRunner(settings=s)

    crawler = runner.create_crawler(spider)
    #if sighandler != None:
    #sighandler.connect(crawler)
    d = runner.crawl(crawler, keyheader=keyheader, conid=conid)
    #d = runner.crawl(spider, keyheader=keyheader, itemcount=itemcount)
    return d
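Because CrawlerRunner.crawl() returns a Twisted Deferred, the caller of run_spider() still has to run the reactor itself. A minimal driver, where the settings module path is a placeholder assumption:

from twisted.internet import reactor
from my_project import settings as my_settings  # placeholder settings module (assumption)

d = run_spider(my_settings, itemcount=100)
d.addBoth(lambda _: reactor.stop())  # stop the reactor once the crawl ends
reactor.run()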
Example #10
class Updater:

    REQUIRED_PARAMETERS = ['MONGO_HOST', 'MONGO_PORT', 'MONGO_DB', 'SPIDERS']

    def __init__(self, settings):
        self.__validate_settings(settings)
        self.settings = settings
        self.spiders = settings.get('SPIDERS')
        self.register = MongoUpdatesRegister(settings)
        self.register.open_db()
        self.spider_loader = SpiderLoader(settings)

    def __validate_settings(self, settings):
        for parameter in Updater.REQUIRED_PARAMETERS:
            if parameter not in settings:
                raise MissingSetting(parameter)

    def run(self):
        process = CrawlerProcess(self.settings)
        for spider in self.spiders:
            kwargs = self._spider_args(spider)
            process.crawl(spider, **kwargs)
        update_id = self.register.start(self.spiders)
        process.start()
        if self._failed(process):
            self.register.fail(update_id)
        else:
            self.register.succeed(update_id)

    def _spider_args(self, spider):
        spider_cls = self.spider_loader.load(spider)
        kwargs = {}
        if self._accepts_last(spider_cls):
            last = self.register.last(spider)
            if last is not None:
                kwargs['last'] = last.start
        return kwargs

    def _accepts_last(self, cls):
        spider_parameters = signature(cls.__init__).parameters
        return 'last' in spider_parameters

    def _failed(self, process):
        finish_reasons = [crawler.stats.get_value('finish_reason') for crawler in process.crawlers]
        return any(reason != 'finished' for reason in finish_reasons)
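Updater only checks that its required keys exist in whatever Settings-like mapping it receives. A minimal sketch of driving it on top of the project settings; the concrete values are assumptions:

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('MONGO_HOST', 'localhost')      # assumed values for illustration
settings.set('MONGO_PORT', 27017)
settings.set('MONGO_DB', 'scraping')
settings.set('SPIDERS', ['example_spider'])  # names resolvable by the project's SpiderLoader

Updater(settings).run()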
Example #11
def crawl(settings={}, spider_name="linksExtract", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)
    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "csv"

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s.csv"

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})
    process.crawl(spider_cls, **spider_kwargs)
    process.start()
    time.sleep(0.5)
Example #12
def crawl(settings={}, spider_name="", key="", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)
    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "csv"
    spider_key = ""
    try:
        spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
            "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception as e:
        logging.exception("Spider or kwargs need start_urls.")
        logging.exception(e)

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/{spider_name}_{key}.csv"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()

    if is_in_aws() and has_task_token():
        import boto3
        import json
        client = boto3.client('stepfunctions')
        client.send_task_success(
            taskToken=os.getenv('TASK_TOKEN_ENV_VARIABLE'),
            output=json.dumps({"feed_uri": feed_uri})
        )
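Like is_in_aws(), the has_task_token() helper is not shown. A minimal sketch, assuming it just checks whether the Step Functions task token environment variable (the same one read by send_task_success above) has been set:

import os

def has_task_token():
    # Assumption: the token is only present when the Lambda was started
    # from a Step Functions task state that passed its task token along.
    return bool(os.getenv('TASK_TOKEN_ENV_VARIABLE'))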
Example #13
 def run_spider():
     s = Settings()
     s.setmodule(ulsan_settings)
     #process = CrawlerProcess(get_project_settings())
     sl = SpiderLoader(settings=s)
     print('#### spider list=', sl.list())
     spider = sl.load(sl.list()[0])
     #process = CrawlerProcess(settings=s)
     #d = process.crawl(spider)
     #process.crawl(UillOrKr)
     #process.start(stop_after_crawl=False)
     #process.start()
     #configure_logging({'LOG_FORMAT': '## %(levelname)s: %(message)s'})
     #configure_logging({'LOG_LEVEL': 'DEBUG'})
     runner = CrawlerRunner(settings=s)
     print(f'#### settings.LOG_ENABLED = {s["LOG_ENABLED"]}')
     d = runner.crawl(spider)
     #d.addBoth(lambda _: reactor.stop())
     #reactor.run()
     return d
Example #14
def crawl(settings={}, spider_name="header_spider", spider_kwargs={}):
    project_settings = get_project_settings()
    spider_loader = SpiderLoader(project_settings)

    spider_cls = spider_loader.load(spider_name)

    feed_uri = ""
    feed_format = "json"

    try:
        spider_key = urlparse(
            spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get(
                "start_urls") else urlparse(spider_cls.start_urls[0]).hostname
    except Exception:
        logging.exception("Spider or kwargs need start_urls.")

    if is_in_aws():
        # Lambda can only write to the /tmp folder.
        settings['HTTPCACHE_DIR'] = "/tmp"
        feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json"
    else:
        feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format(
            os.path.join(os.getcwd(), "feed"),
            spider_key,
        )
    if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"):
        settings["HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage"
        settings["S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache"

    settings['FEED_URI'] = feed_uri
    settings['FEED_FORMAT'] = feed_format

    process = CrawlerProcess({**project_settings, **settings})

    process.crawl(spider_cls, **spider_kwargs)
    process.start()
Example #15
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    spider_loader = SpiderLoader(get_project_settings())
    spiders = spider_loader.list()

    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider_loader.load(spider))
    process.start()