def __init__(self, settings=None):
    SpiderLoader.__init__(self, settings)
    self.spidercls = self.load("scrapy_spider")
    self.craw_runner = my_CrawlerRunner(settings)
    # my_crawler = craw_runner._create_crawler(spidercls)
    # spidercls.set_crawler(my_crawler)
    print("wwj debug in spidermanager", self.spidercls)

def get_crawler_class(self, crawler):
    """
    Searches through the modules in self.__crawer_module for a crawler
    with the name passed along.

    :param str crawler: Name of the crawler to load
    :rtype: crawler-class
    """
    settings = Settings()
    settings.set('SPIDER_MODULES', [self.__crawer_module])
    spider_loader = SpiderLoader(settings)
    return spider_loader.load(crawler)

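# Hedged standalone sketch of the same lookup as get_crawler_class above.
# "myproject.spiders" and load_spider_class are placeholder names, not taken
# from the original code; only the Settings/SpiderLoader calls are Scrapy API.
from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader


def load_spider_class(spider_name, spider_module="myproject.spiders"):
    settings = Settings()
    settings.set("SPIDER_MODULES", [spider_module])
    return SpiderLoader.from_settings(settings).load(spider_name)
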
def setUp(self):
    orig_spiders_dir = os.path.join(module_dir, 'test_spiders')
    self.tmpdir = tempfile.mkdtemp()
    self.spiders_dir = os.path.join(self.tmpdir, 'test_spiders_xxx')
    shutil.copytree(orig_spiders_dir, self.spiders_dir)
    sys.path.append(self.tmpdir)
    settings = Settings({'SPIDER_MODULES': ['test_spiders_xxx']})
    self.spider_loader = SpiderLoader.from_settings(settings)

def setUp(self):
    orig_spiders_dir = os.path.join(module_dir, "test_spiders")
    self.tmpdir = self.mktemp()
    os.mkdir(self.tmpdir)
    self.spiders_dir = os.path.join(self.tmpdir, "test_spiders_xxx")
    shutil.copytree(orig_spiders_dir, self.spiders_dir)
    sys.path.append(self.tmpdir)
    settings = Settings({"SPIDER_MODULES": ["test_spiders_xxx"]})
    self.spider_loader = SpiderLoader.from_settings(settings)

def test_bad_spider_modules_warning(self):
    with warnings.catch_warnings(record=True) as w:
        module = 'tests.test_spiderloader.test_spiders.doesnotexist'
        settings = Settings({'SPIDER_MODULES': [module]})
        spider_loader = SpiderLoader.from_settings(settings)
        self.assertIn("Could not load spiders from module", str(w[0].message))
        spiders = spider_loader.list()
        self.assertEqual(spiders, [])

def crawl(settings={}, spider_name="", key="", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "csv" spider_key = "" try: spider_key = urlparse(spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception as e: logging.exception("Spider or kwargs need start_urls.") logging.exception(e) if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/{spider_name}_{key}.csv" else: feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format( os.path.join(os.getcwd(), "feed"), spider_key, ) settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start() if is_in_aws() and has_task_token(): import boto3 import json client = boto3.client('stepfunctions') client.send_task_success( taskToken=os.getenv('TASK_TOKEN_ENV_VARIABLE'), output=json.dumps({"feed_uri": feed_uri}) )
def run_spider():
    s = Settings()
    s.setmodule(ulsan_settings)
    # process = CrawlerProcess(get_project_settings())
    sl = SpiderLoader(settings=s)
    print('#### spider list=', sl.list())
    spider = sl.load(sl.list()[0])
    # process = CrawlerProcess(settings=s)
    # d = process.crawl(spider)
    # process.crawl(UillOrKr)
    # process.start(stop_after_crawl=False)
    # process.start()
    # configure_logging({'LOG_FORMAT': '## %(levelname)s: %(message)s'})
    # configure_logging({'LOG_LEVEL': 'DEBUG'})
    runner = CrawlerRunner(settings=s)
    print(f'#### settings.LOG_ENABLED = {s["LOG_ENABLED"]}')
    d = runner.crawl(spider)
    # d.addBoth(lambda _: reactor.stop())
    # reactor.run()
    # return d
    return d

def crawl(settings={}, spider_name="header_spider", spider_kwargs={}): project_settings = get_project_settings() spider_loader = SpiderLoader(project_settings) spider_cls = spider_loader.load(spider_name) feed_uri = "" feed_format = "json" try: spider_key = urlparse( spider_kwargs.get("start_urls")[0]).hostname if spider_kwargs.get( "start_urls") else urlparse(spider_cls.start_urls[0]).hostname except Exception: logging.exception("Spider or kwargs need start_urls.") if is_in_aws(): # Lambda can only write to the /tmp folder. settings['HTTPCACHE_DIR'] = "/tmp" feed_uri = f"s3://{os.getenv('FEED_BUCKET_NAME')}/%(name)s-{spider_key}.json" else: feed_uri = "file://{}/%(name)s-{}-%(time)s.json".format( os.path.join(os.getcwd(), "feed"), spider_key, ) if (is_in_aws() and os.getenv("USE_S3_CACHE") != "0") or os.getenv("USE_S3_CACHE"): settings[ "HTTPCACHE_STORAGE"] = "my_sls_scraper.extensions.s3cache.S3CacheStorage" settings[ "S3CACHE_URI"] = f"s3://{os.environ['HTTP_CACHE_BUCKET_NAME']}/cache" settings['FEED_URI'] = feed_uri settings['FEED_FORMAT'] = feed_format process = CrawlerProcess({**project_settings, **settings}) process.crawl(spider_cls, **spider_kwargs) process.start()
def test_bad_spider_modules_warning(self):
    with warnings.catch_warnings(record=True) as w:
        module = 'tests.test_spiderloader.test_spiders.doesnotexist'
        settings = Settings({
            'SPIDER_MODULES': [module],
            'SPIDER_LOADER_WARN_ONLY': True
        })
        spider_loader = SpiderLoader.from_settings(settings)
        self.assertIn("Could not load spiders from module", str(w[0].message))
        spiders = spider_loader.list()
        self.assertEqual(spiders, [])

def test_dupename_warning(self):
    # copy one spider module so that two spiders share the same name
    shutil.copyfile(
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider3.py'),
        os.path.join(self.tmpdir, 'test_spiders_xxx/spider3dupe.py'))
    with warnings.catch_warnings(record=True) as w:
        spider_loader = SpiderLoader.from_settings(self.settings)
        self.assertEqual(len(w), 1)
        msg = str(w[0].message)
        self.assertIn("several spiders with the same name", msg)
        self.assertIn("'spider3'", msg)
        spiders = set(spider_loader.list())
        self.assertEqual(spiders, set(['spider1', 'spider2', 'spider3', 'spider4']))

class Updater:
    REQUIRED_PARAMETERS = ['MONGO_HOST', 'MONGO_PORT', 'MONGO_DB', 'SPIDERS']

    def __init__(self, settings):
        self.__validate_settings(settings)
        self.settings = settings
        self.spiders = settings.get('SPIDERS')
        self.register = MongoUpdatesRegister(settings)
        self.register.open_db()
        self.spider_loader = SpiderLoader(settings)

    def __validate_settings(self, settings):
        for parameter in Updater.REQUIRED_PARAMETERS:
            if parameter not in settings:
                raise MissingSetting(parameter)

    def run(self):
        process = CrawlerProcess(self.settings)
        for spider in self.spiders:
            kwargs = self._spider_args(spider)
            process.crawl(spider, **kwargs)
        update_id = self.register.start(self.spiders)
        process.start()
        if self._failed(process):
            self.register.fail(update_id)
        else:
            self.register.succeed(update_id)

    def _spider_args(self, spider):
        spider_cls = self.spider_loader.load(spider)
        kwargs = {}
        if self._accepts_last(spider_cls):
            last = self.register.last(spider)
            if last is not None:
                kwargs['last'] = last.start
        return kwargs

    def _accepts_last(self, cls):
        spider_parameters = signature(cls.__init__).parameters
        return 'last' in spider_parameters

    def _failed(self, process):
        finish_reasons = [crawler.stats.get_value('finish_reason')
                          for crawler in process.crawlers]
        return any(reason != 'finished' for reason in finish_reasons)

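# Hedged usage sketch for the Updater above. MongoUpdatesRegister and
# MissingSetting belong to the surrounding project, and every value set below
# is a placeholder assumption; only get_project_settings() is Scrapy API.
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('MONGO_HOST', 'localhost')          # placeholder
settings.set('MONGO_PORT', 27017)                # placeholder
settings.set('MONGO_DB', 'updates')              # placeholder
settings.set('SPIDERS', ['spider1', 'spider2'])  # spider names known to the loader
Updater(settings).run()
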
def test_bad_spider_modules_warning(self):
    with warnings.catch_warnings(record=True) as w:
        module = 'tests.test_spiderloader.test_spiders.doesnotexist'
        settings = Settings({
            'SPIDER_MODULES': [module],
            'SPIDER_LOADER_WARN_ONLY': True
        })
        spider_loader = SpiderLoader.from_settings(settings)
        if str(w[0].message).startswith("_SixMetaPathImporter"):
            # needed on 3.10 because of https://github.com/benjaminp/six/issues/349,
            # at least until all six versions we can import (including botocore.vendored.six)
            # are updated to 1.16.0+
            w.pop(0)
        self.assertIn("Could not load spiders from module", str(w[0].message))
        spiders = spider_loader.list()
        self.assertEqual(spiders, [])

def cmdline_crawl(args):
    from scrapy.utils.project import get_project_settings
    from scrapy.spiderloader import SpiderLoader
    settings = get_project_settings()
    spiders = SpiderLoader.from_settings(settings)
    if not args:
        spiderlist = spiders.list()
        if spiderlist:
            print('spiders list {}'.format(spiderlist))
        sys.exit()
    spidername = args.spider
    filepath = inspect.getabsfile(spiders.load(spidername))
    os.environ.pop('SCRAPY_SETTINGS_MODULE')
    settings, _conf = _get_settings_and_conf(args)
    server = connection.get_redis(**settings['REDIS_PARAMS'])
    with open(filepath, encoding='utf-8') as f:
        script = f.read()
    jsondata = _send_script_start_work(spidername, script, server)
    jsondata.pop('script')
    print('send task:')
    print(json.dumps(jsondata, indent=4))

def _prepare_domains_items_refs(self):
    spider_loader = SpiderLoader.from_settings(self.settings)
    if hasattr(self, 'spiders'):
        spider_names = getattr(self, 'spiders').split(',')
    else:
        spider_names = [spider_name for spider_name in spider_loader.list()
                        if spider_name not in self.spiders_ignored]
    for spider_name in spider_names:
        Spider = spider_loader.load(spider_name)
        for domain in Spider.allowed_domains:
            for i, item_ref in enumerate(Spider.items_refs):
                item_ref['spider_name'] = spider_name
                Spider.items_refs[i] = item_ref
            self.domains_items_refs[domain] = Spider.items_refs
        self.allowed_domains += Spider.allowed_domains
    self.allowed_domains.sort(key=len, reverse=True)

def test_load_base_spider(self):
    module = 'tests.test_spiderloader.test_spiders.spider0'
    settings = Settings({'SPIDER_MODULES': [module]})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 0

def test_load_spider_module_multiple(self):
    prefix = 'tests.test_spiderloader.test_spiders.'
    module = ','.join(prefix + s for s in ('spider1', 'spider2'))
    settings = Settings({'SPIDER_MODULES': module})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 2

def main():
    logger.warn("Loading project")
    s3_bucket = os.environ.get('S3_BUCKET')
    assert s3_bucket, "Please specify an S3_BUCKET environment variable"

    utcnow = datetime.datetime.utcnow()
    tstamp = utcnow.strftime('%F-%H-%M-%S')
    pool_size = 12

    settings = get_project_settings()
    spider_loader = SpiderLoader.from_settings(settings)
    spider_names = spider_loader.list()

    pool = multiprocessing.Pool(pool_size, maxtasksperchild=1)
    logger.info("Starting to crawl %s spiders on %s processes", len(spider_names), pool_size)
    results = pool.imap_unordered(run_one_spider, spider_names)
    pool.close()
    pool.join()
    logger.info("Done crawling")

    # The imap_unordered call returns an iterator, so throw it in a list
    results = list(results)

    client = boto3.client('s3')
    s3_key_prefix = "runs/{}".format(tstamp)

    # Concatenate and gzip the output geojsons
    _, output_gz_filename = tempfile.mkstemp('.geojson.gz')
    with gzip.open(output_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('output_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
    s3_output_size = os.path.getsize(output_gz_filename)
    s3_output_size_mb = s3_output_size / 1024 / 1024

    # Post it to S3
    s3_output_key = '{}/output.geojson.gz'.format(s3_key_prefix)
    client.upload_file(
        output_gz_filename, s3_bucket, s3_output_key,
        ExtraArgs={
            'ACL': 'public-read',
            'ContentType': 'application/json',
            'ContentDisposition': 'attachment; filename="output-{}.geojson.gz"'.format(tstamp),
        })
    logger.warn("Saved output to https://s3.amazonaws.com/%s/%s", s3_bucket, s3_output_key)

    # Concatenate and gzip the log files
    _, log_gz_filename = tempfile.mkstemp('.log.gz')
    with gzip.open(log_gz_filename, 'wb') as f_out:
        for r in results:
            with open(r.pop('log_filename'), 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)

    # Post it to S3
    s3_log_key = '{}/all_logs.txt.gz'.format(s3_key_prefix)
    client.upload_file(log_gz_filename, s3_bucket, s3_log_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'text/plain; charset=utf-8',
        'ContentEncoding': 'gzip',
    })
    logger.warn("Saved logfile to https://s3.amazonaws.com/%s/%s", s3_bucket, s3_log_key)

    metadata = {
        'spiders': results,
        'links': {
            'download_url': "https://s3.amazonaws.com/{}/{}".format(s3_bucket, s3_output_key),
            'log_url': "https://s3.amazonaws.com/{}/{}".format(s3_bucket, s3_log_key),
        }
    }

    with open('metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2, default=json_serial)

    s3_key = '{}/metadata.json'.format(s3_key_prefix)
    client.upload_file('metadata.json', s3_bucket, s3_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'application/json; charset=utf-8',
    })
    logger.warn("Saved metadata to https://s3.amazonaws.com/%s/%s", s3_bucket, s3_key)

    s3_key = 'runs/latest/metadata.json'
    client.upload_file('metadata.json', s3_bucket, s3_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'application/json; charset=utf-8',
    })
    logger.warn("Saved metadata to https://s3.amazonaws.com/%s/%s", s3_bucket, s3_key)

    total_count = sum(filter(None, (s['item_scraped_count'] for s in results)))
    template_content = {
        'download_url': 'https://s3.amazonaws.com/{}/{}'.format(s3_bucket, s3_output_key),
        'download_size': round(s3_output_size_mb, 1),
        'row_count': total_count,
        'spider_count': len(results),
        'updated_datetime': utcnow.replace(microsecond=0).isoformat(),
    }

    with open('info_embed.html', 'w') as f:
        f.write(
            "<html><body>"
            "<a href=\"{download_url}\">Download</a> "
            "({download_size} MB)<br/><small>{row_count:,} rows from "
            "{spider_count} spiders, updated {updated_datetime}Z</small>"
            "</body></html>\n".format(**template_content))

    s3_key = 'runs/latest/info_embed.html'
    client.upload_file('info_embed.html', s3_bucket, s3_key, ExtraArgs={
        'ACL': 'public-read',
        'ContentType': 'text/html; charset=utf-8',
    })
    logger.warn("Saved embed to https://s3.amazonaws.com/%s/%s", s3_bucket, s3_key)

from scrapy.extensions.httpcache import FilesystemCacheStorage
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

SETTINGS = get_project_settings()
STORAGE = FilesystemCacheStorage(SETTINGS)
SPIDER_LOADER = SpiderLoader.from_settings(SETTINGS)
SPIDER_CLASS = SPIDER_LOADER.load('ksdata')
SPIDER = SPIDER_CLASS(file='../sample.csv')


def get_busted_caches():
    for request in SPIDER.start_requests():
        # _read_meta() returns None when there is no cached response on disk
        meta = STORAGE._read_meta(SPIDER, request)
        if meta and meta['status'] in SETTINGS['HTTPCACHE_IGNORE_HTTP_CODES']:
            yield STORAGE._get_request_path(SPIDER, request)


if __name__ == "__main__":
    import shutil
    for path in get_busted_caches():
        shutil.rmtree(path)

def test_load_spider_module(self):
    prefix = 'tests.test_spiderloader.test_spiders.'
    module = ','.join(prefix + s for s in ('spider1', 'spider2'))
    settings = Settings({'SPIDER_MODULES': module})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 2

#!/usr/bin/env python
from scrapy.crawler import CrawlerProcess
from threading import Thread
from scrapy.settings import Settings
from scrapy.spiderloader import SpiderLoader
import time

settings = Settings()
settings_module_path = "settings"
if settings_module_path:
    settings.setmodule(settings_module_path, priority='project')

process = CrawlerProcess(settings)


def _start_crawler_thread():
    t = Thread(target=process.start, kwargs={'stop_after_crawl': False})
    t.daemon = True
    t.start()


loader = SpiderLoader(settings)
# loader.list() returns spider names; CrawlerProcess.crawl() accepts a name
# and resolves it through the spider loader.
for spider_name in loader.list():
    process.crawl(spider_name)

_start_crawler_thread()

while 1:
    time.sleep(2)

def __init__(self):
    settings = get_scraper_settings()
    loader = SpiderLoader.from_settings(settings)
    self.spider_names = loader.list()
    self.server = redis.StrictRedis(
        host=settings.get('REDIS_HOST'),
        port=settings.get('REDIS_PORT'))

from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
process = CrawlerProcess(settings)
spider_loader = SpiderLoader.from_settings(settings)

ranking_spider = spider_loader.load("d1rankings")
print("----------------Ranking spider started----------------")
process.crawl(ranking_spider)
process.start()
print("----------------Ranking spider finished----------------\n")

def test_load_spider_module(self):
    prefix = "tests.test_spiderloader.test_spiders."
    module = ",".join(prefix + s for s in ("spider1", "spider2"))
    settings = Settings({"SPIDER_MODULES": module})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 2

def test_load_spider_module(self):
    module = "tests.test_spiderloader.test_spiders.spider1"
    settings = Settings({"SPIDER_MODULES": [module]})
    self.spider_loader = SpiderLoader.from_settings(settings)
    assert len(self.spider_loader._spiders) == 1

# "lianjia-cj-gz", # "lianjia-cj-hz", # "lianjia-cj-nj", # "lianjia-cj-cs", # "lianjia-cj-wh", # "lianjia-cj-tj", # "lianjia-cj-zz", #"lianjia-cj-xa", #"lianjia-cj-cd", #"lianjia-cj-su", # "lianjia-cj-cq", # "lianjia-cj-xm", # "lianjia-cj-hf", ]) process = CrawlerProcess(get_project_settings()) sloader = SpiderLoader(get_project_settings()) scheduler = TwistedScheduler() hour = 3 for spidername in sloader.list(): # scheduler.add_job(task, 'cron', minute="*/20") if spidername in allow2: #https://apscheduler.readthedocs.io/en/latest/modules/triggers/cron.html # scheduler.add_job(process.crawl, 'cron', args=[spidername], hour="*/" + str(hour)) # scheduler.add_job(func=aps_test, args=('定时任务',), trigger='cron', second='*/5') # scheduler.add_job(func=aps_test, args=('一次性任务',), # next_run_time=datetime.datetime.now() + datetime.timedelta(seconds=12)) # scheduler.add_job(func=aps_test, args=('循环任务',), trigger='interval', seconds=3) print(spidername) scheduler.add_job(process.crawl, 'cron', args=[spidername],
from scrapy.spiderloader import SpiderLoader
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    spider_loader = SpiderLoader(get_project_settings())
    spiders = spider_loader.list()
    process = CrawlerProcess(get_project_settings())
    for spider in spiders:
        process.crawl(spider_loader.load(spider))
    process.start()

def list_spiders(self):
    loader = SpiderLoader.from_settings(self.settings)
    return loader.list()