def run(self, args, opts):
    basedir = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    def _keyfunc(module):
        return module.__name__.rsplit('.', 1)[-1].split('_', 1)[0]

    with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f:
        f.write(dedent("""\
        Spiders
        =======

        .. Do not edit this file. Instead, run: `scrapy updatedocs`
        """))

        for key, group in groupby(walk_modules('kingfisher_scrapy.spiders'), _keyfunc):
            if key in ('spiders', 'test'):
                continue
            f.write('\n{}\n{}\n'.format(key.capitalize(), '-' * len(key)))
            for module in group:
                for cls in iter_spider_classes(module):
                    f.write('\n.. autoclass:: {}.{}\n   :no-members:\n'.format(
                        module.__name__, cls.__name__))

def __init__(self, settings):
    self.elastic = Elastic.from_settings(settings)

    # Collect spiders from the SPIDER_MODULES packages (e.g. /product_spider/spiders
    # and /product_spider/datafeeds) and map each spider name to its redis_key.
    self.spiders_rediskey = {}
    spider_modules = settings.getlist('SPIDER_MODULES')
    for name in spider_modules:
        try:
            for module in walk_modules(name):
                for spcls in iter_spider_classes(module):
                    if spcls.name and getattr(spcls, 'crawler_type', None) == 'spider' \
                            and hasattr(spcls, 'redis_key'):
                        self.spiders_rediskey[spcls.name] = spcls.redis_key
                        logger.info('search spider:%(name)s, key=%(key)s',
                                    {'name': spcls.name, 'key': spcls.redis_key})
        except ImportError:
            pass

    self.req_session = requests.Session()
    self.group_items_api = settings.get('SEARCH_API') + '/se/spider-update/group/'
    self.group_items_cache_ttl = 86400

class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE",
                          help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT", default="jsonlines",
                          help="format to use for dumping items with -o (default: %default)")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
        if opts.output:
            if opts.output == '-':
                self.settings.overrides['FEED_URI'] = 'stdout:'
            else:
                self.settings.overrides['FEED_URI'] = opts.output
            valid_output_formats = (list(self.settings['FEED_EXPORTERS'].keys())
                                    + list(self.settings['FEED_EXPORTERS_BASE'].keys()))
            if opts.output_format not in valid_output_formats:
                raise UsageError('Invalid/unrecognized output format: %s, Expected %s' %
                                 (opts.output_format, valid_output_formats))
            self.settings.overrides['FEED_FORMAT'] = opts.output_format

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spider = spclasses.pop()(**opts.spargs)
        crawler = self.crawler_process.create_crawler()
        crawler.crawl(spider)
        self.crawler_process.start()

class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spider = spclasses.pop()()
        # Schedule the spider and start the engine.
        self.crawler.queue.append_spider(spider)
        self.crawler.start()

def find_spider_cls(spider_name, spider_packages):
    """Find the spider class whose name matches ``spider_name``."""
    for package_name in spider_packages:
        for module in walk_modules(package_name):
            for spider_cls in iter_spider_classes(module):
                if spider_cls.name == spider_name:
                    return spider_cls

def crawl_in_loop(self, runner):
    """Crawl all proxy spiders in an endless loop."""
    # Collect the spider classes to run.
    spider_list = []
    for spider_class in iter_spider_classes(regex_proxy_spider):
        ip_count = getattr(spider_class, 'ip_count', 0)
        if ip_count > 0:
            spider_list.append(spider_class)

    all_loop = ProxyCounter()
    single_loop = ProxyCounter()
    # Started once at the beginning; counted at the end of every round.
    all_loop.start()

    # Endless loop.
    loop_times = 0
    while loop_times >= 0:
        loop_times += 1
        # Started at the beginning of every round; counted after each spider finishes.
        single_loop.start()
        while single_loop.available.start_num > 100:
            print(f'{single_loop.available.start_num} valid IPs, sleeping for 10 minutes')
            time.sleep(60 * 10)
            single_loop.start()  # count at the start

        log.info(f'Round {loop_times} of crawling started')

        # Crawl.
        for i in range(len(spider_list)):
            spider = spider_list[i]
            log.info(
                f'Round {loop_times}, spider {i + 1}/{len(spider_list)} ({spider.name}) started, '
                f'{single_loop.print_count()}')
            try:
                yield runner.crawl(spider)
            except SystemExit:
                pass
            sleep_time = 10
            divider = '-' * 10
            single_loop.count()
            log.info(
                f'{divider}Round {loop_times}, spider {i + 1}/{len(spider_list)} ({spider.name}) finished, '
                f'{single_loop.print_count()} {divider}')
            log.info(f'Waiting to run the next spider, sleep {sleep_time}')
            time.sleep(sleep_time)

        # Delay before the next round.
        sleep_time = 60
        log.info(f'This round of crawling finished, waiting for the next round, sleep {sleep_time}')
        all_loop.count()
        log.info(all_loop.print_count())
        time.sleep(sleep_time)

    # noinspection PyUnresolvedReferences
    reactor.stop()

def get_spider_class(spider_name, project_settings):
    spider_modules = project_settings.get('SPIDER_MODULES')
    for spider_module in spider_modules:
        modules = walk_modules(spider_module)
        for module in islice(modules, 1, None):
            for spider_class in iter_spider_classes(module):
                if spider_class.name == spider_name:
                    return spider_class
    return None

def _load_spiders(self, module):
    """Load the spiders found in a module.

    iter_spider_classes() only yields objects for which:
      * inspect.isclass(obj)                 -- the object is a class
      * issubclass(obj, Spider)              -- it subclasses Spider
      * obj.__module__ == module.__name__    -- it is defined in this module
      * getattr(obj, 'name', None)           -- its name attribute is non-empty
    """
    for spcls in iter_spider_classes(module):
        self._spiders[spcls.name] = spcls

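# A minimal sketch (not from the original source) of the filter described in the
# docstring above, roughly equivalent to scrapy.utils.spider.iter_spider_classes;
# the function name iter_spider_classes_sketch is illustrative only.
import inspect

from scrapy import Spider


def iter_spider_classes_sketch(module):
    for obj in vars(module).values():
        if (inspect.isclass(obj)
                and issubclass(obj, Spider)
                and obj.__module__ == module.__name__
                and getattr(obj, 'name', None)):
            yield obj
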
def assert_good_spider_type(settings, spider_name):
    spider_modules = settings['SPIDER_MODULES']
    if isinstance(spider_modules, str):
        spider_modules = [spider_modules]
    for spider_module in spider_modules:
        for module in walk_modules(spider_module):
            for spcls in iter_spider_classes(module):
                if spcls.name == spider_name:
                    if not issubclass(spcls, ForumSpider):
                        raise Exception(
                            'Spider %s is not a Forum Spider. '
                            'Please use the right script for your spider.' % spider_name)

def find_spider_cls(spider_name, spider_packages):
    """Find the spider class whose name equals the ``spider_name`` argument.

    :param spider_name: spider name to look for
    :param spider_packages: a list of package names that will be searched for spider classes
    """
    for package_name in spider_packages:
        for module in walk_modules(package_name):
            for spider_cls in iter_spider_classes(module):
                if spider_cls.name == spider_name:
                    return spider_cls

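# A hedged usage sketch (not from the original source): the package path
# 'myproject.spiders' and the spider name 'example' are illustrative only.
spider_cls = find_spider_cls('example', ['myproject.spiders'])
if spider_cls is None:
    raise KeyError('Spider not found: example')
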
def run(self, args, opts):
    # settings = get_project_settings()
    # spider_loader = self.crawler_process.spider_loader
    crawler = self.crawler_process.create_crawler()
    names = crawler.spiders.list()
    spclasses = []
    for spidername in names:
        crawler = self.crawler_process.create_crawler()
        module = _import_file("./eastmoney/spiders/" + spidername + "_spider.py")
        # spclasses = spclasses + list(iter_spider_classes(module))
        spider = list(iter_spider_classes(module)).pop()(**opts.spargs)
        crawler.crawl(spider)
    self.crawler_process.start()
    crawler._spider = None

def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    spidername = args[0]
    spider_module = __import__file(spidername)
    sp_classes = list(iter_spider_classes(spider_module))
    if not sp_classes:
        raise UsageError("No spider found : %s\n" % spidername)
    spidercls = sp_classes.pop()
    self.crawler_process.crawl(spidercls, **opts.__dict__)
    self.crawler_process.start()
    if self.crawler_process.bootstrap_failed:
        self.exitcode = 1

def run(self, args, opts):
    basedir = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    def _keyfunc(module):
        module_name = module.__name__.rsplit('.', 1)[-1]
        if module_name.startswith(('costa_rica', 'dominican_republic')):
            return '_'.join(module_name.split('_', 2)[:2])
        return module_name.split('_', 1)[0]

    with open(os.path.join(basedir, 'docs', 'spiders.rst')) as f:
        lines = []
        for line in f:
            lines.append(line)
            if line.startswith('.. Do not edit past this line.'):
                break

    with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f:
        for line in lines:
            f.write(line)

        for key, group in groupby(walk_modules('kingfisher_scrapy.spiders'), _keyfunc):
            if key in ('spiders', 'fail'):
                continue
            f.write(f"\n{key.replace('_', ' ').title()}\n{'-' * len(key)}\n")
            for module in group:
                for cls in iter_spider_classes(module):
                    f.write(f'\n.. autoclass:: {module.__name__}.{cls.__name__}\n   :no-members:\n')

                    infix = ''
                    if cls.__doc__:
                        section = re.search(
                            r'^Environment variables\n(.+?)(?:^\S|\Z)',
                            dedent(cls.__doc__), re.MULTILINE | re.DOTALL)
                        if section:
                            environment_variables = re.findall(
                                r'^(\S.+)\n ', dedent(section[1]), re.MULTILINE)
                            infix = f"env {' '.join([f'{variable}=...' for variable in environment_variables])} "

                    f.write('\n.. code-block:: bash\n')
                    f.write(f"\n   {infix}scrapy crawl {module.__name__.rsplit('.')[-1]}\n")

def enumerate_spider_classes():
    original_cd = os.getcwd()
    imported_settings = sys.modules.pop("settings", None)
    for spider_project in SpiderProject.objects.all():
        os.chdir(spider_project.path)
        # Force get_project_settings() to reconsider the current directory.
        os.environ.pop(ENVVAR, None)
        project_settings = get_project_settings()
        for module_or_package_name in project_settings.get("SPIDER_MODULES"):
            for module in walk_modules(module_or_package_name):
                for spider_cls in iter_spider_classes(module):
                    yield (spider_project, spider_cls)
    if imported_settings is not None:
        sys.modules["settings"] = imported_settings
    os.chdir(original_cd)

def run(args):
    print(args)
    if len(args) != 1:
        raise UsageError()
    scrapy.utils.log.configure_logging()
    spidername = args[0]
    spider_module = __import__file(spidername)
    sp_classes = list(iter_spider_classes(spider_module))
    if not sp_classes:
        raise UsageError("No spider found : %s\n" % spidername)
    spidercls = sp_classes.pop()
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)
    process.crawl(spidercls)
    process.start()

def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    filename = args[0]
    if not os.path.exists(filename):
        raise UsageError("File not found: %s\n" % filename)
    try:
        module = _import_file(filename)
    except (ImportError, ValueError) as e:
        raise UsageError("Unable to load %r: %s\n" % (filename, e))
    spclasses = list(iter_spider_classes(module))
    if not spclasses:
        raise UsageError("No spider found in file: %s\n" % filename)
    spidercls = spclasses.pop()
    self.crawler_process.crawl(spidercls, **opts.spargs)
    self.crawler_process.start()

def _load_spider(self, module, spider):
    for spcls in iter_spider_classes(module):
        spider_name = spcls.name
        if spider_name != spider:
            continue
        spmdl = self._get_spider_db_model(spcls.name, ['crawl_method2'])
        if spmdl and spmdl.crawl_method2 and spmdl.crawl_method2.crawl_method:
            new_spcls = self._configure_spider_class(spcls, spmdl)
            if new_spcls is not None and inspect.isclass(new_spcls) and (
                    issubclass(new_spcls, BaseSpider) or issubclass(new_spcls, Spider)):
                spcls = new_spcls
                setattr(spcls, 'name', spider_name)
        self._spiders[spider_name] = spcls
        break

class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE",
                          help="set spider argument (may be repeated)")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spider = spclasses.pop()(**opts.spargs)
        self.crawler.crawl(spider)
        self.crawler.start()

def run(self, args, opts):
    if len(args) != 1:
        raise UsageError()
    filename = args[0]
    if not os.path.exists(filename):
        raise UsageError(f"File not found: {filename}\n")
    try:
        module = _import_file(filename)
    except (ImportError, ValueError) as e:
        raise UsageError(f"Unable to load {filename!r}: {e}\n")
    spclasses = list(iter_spider_classes(module))
    if not spclasses:
        raise UsageError(f"No spider found in file: {filename}\n")
    spidercls = spclasses.pop()
    self.crawler_process.crawl(spidercls, **opts.spargs)
    self.crawler_process.start()
    if self.crawler_process.bootstrap_failed:
        self.exitcode = 1

def find_spiders(type=None):
    """Find all classes that subclass scrapy.Spider.

    If `type` is given, the output is filtered by type.
    Possible type values: ['vk', 'site'].
    """
    spider_map = {}

    def _get_spiders(spiders, spider_map):
        """Return the spider_map dict of all spiders found, keyed by unique name.

        If two spiders with the same name are found and one subclasses the
        other, the child class wins (based on the MRO).
        """
        # If two spiders with the same name are found, take the one that
        # subclasses the autogenerated spider.
        for s in spiders:
            if s.name in spider_map:
                # Keep only the class that subclasses the existing one.
                old = spider_map[s.name]
                if old in s.mro():
                    spider_map[s.name] = s
            else:
                spider_map[s.name] = s  # the same one as passed, with new values
        return spider_map

    for module in misc.walk_modules(settings.NEWSPIDER_MODULE):
        # Crawl responsibly.
        spiders = [s for s in spider.iter_spider_classes(module)
                   if s.type == type and type or not type]
        _get_spiders(spiders, spider_map)

    # Add user-generated modules.
    user_spiders = autogenerate.load_spiders_from_json(settings.USER_SPIDERS_FILE)
    _get_spiders(user_spiders, spider_map)

    # check for name uniqueness
    return spider_map.values()

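# A hedged usage sketch (not from the original source): 'vk' is one of the type
# values mentioned in the docstring above; the print loop is illustrative only.
for spider_cls in find_spiders(type='vk'):
    print(spider_cls.name)
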
def test_iter_spider_classes(self):
    import scrapy.tests.test_utils_spider

    it = iter_spider_classes(scrapy.tests.test_utils_spider)
    self.assertEqual(set(it), {MySpider1, MySpider2})

def _load_spiders(self, module):
    for spcls in iter_spider_classes(module):
        self.spiders.append(spcls.name)

def crawl_in_loop(self, runner):
    """Crawl all proxy spiders in an endless loop."""
    # Collect the spider classes to run.
    spider_list = []
    for spider_class in iter_spider_classes(regex_proxy_spider):
        ip_count = getattr(spider_class, 'ip_count', 0)
        if ip_count > 0:
            spider_list.append(spider_class)

    loop_times = 0
    loop_end_count = 0
    all_loop_proxy_count = 0
    """Total number of proxies crawled over all rounds."""

    # Endless loop.
    while loop_times >= 0:
        loop_times += 1
        # Count at the start of the round.
        if loop_end_count == 0:
            # First round: read the count directly.
            loop_start_count = proxy_manager.count()
        else:
            # Otherwise reuse the count taken at the end of the previous round.
            loop_start_count = loop_end_count
        log.info(f'Round {loop_times} of crawling started, {loop_start_count} IPs in total')

        # Crawl.
        spider_end_count = 0
        for i in range(len(spider_list)):
            spider = spider_list[i]
            if spider_end_count == 0:
                spider_start_count = loop_start_count
            else:
                spider_start_count = spider_end_count
            log.info(
                f'Round {loop_times}, spider {i + 1}/{len(spider_list)} ({spider.name}) started, '
                f'{spider_start_count} IPs in total')
            try:
                yield runner.crawl(spider)
            except SystemExit:
                pass
            sleep_time = 10
            spider_end_count = proxy_manager.count()
            spider_crawled_count = spider_end_count - spider_start_count
            loop_crawled_count = spider_end_count - loop_start_count  # crawled in this round
            all_loop_proxy_count += loop_crawled_count
            divider = '-' * 10
            log.info(
                f'{divider}Round {loop_times}, spider {i + 1}/{len(spider_list)} ({spider.name}) finished, '
                f'crawled {spider_crawled_count}/{loop_crawled_count}/{all_loop_proxy_count} proxies{divider}')
            log.info(f'Waiting to run the next spider, sleep {sleep_time}')
            log.info(f'{proxy_manager.available_count()} valid proxies at the moment')
            time.sleep(sleep_time)

        # Count at the end of the round.
        loop_end_count = proxy_manager.count()
        # Delay before the next round.
        sleep_time = 60
        log.info(
            f'This round crawled {loop_end_count - loop_start_count}/{loop_end_count} proxies, '
            f'waiting for the next round, sleep {sleep_time}')
        log.info(f'{proxy_manager.available_count()} valid proxies at the moment')
        time.sleep(sleep_time)

    reactor.stop()

def run(self, args, opts):
    for module in walk_modules('kingfisher_scrapy.spiders'):
        for cls in iter_spider_classes(module):
            Checker(module, cls).check()

def _load_spiders(self, module):
    for spcls in iter_spider_classes(module):
        self._found[spcls.name].append((module.__name__, spcls.__name__))
        self._spiders[spcls.name] = spcls

def _load_spider(self, spider_path, spider_name=''):
    for spider_cls in iter_spider_classes(spider_path):
        if spider_name and spider_cls.name == spider_name:
            return spider_cls
        else:
            return spider_cls  # return the first spider in the module

def _load_spiders(self, module):
    for spcls in iter_spider_classes(module):
        self._spiders[spcls.name] = spcls

def run_spider(spider_id):
    spider = SpiderModel.objects.get(id=spider_id)
    execution = Execution.objects.create(spider_id=spider_id, time_started=now())

    user_settings = getattr(settings, 'SCRATCHY_SPIDERS', {})
    item_storage = tempfile.NamedTemporaryFile(delete=False)
    default_settings = {
        'DNS_TIMEOUT': 5,
        'DOWNLOAD_TIMEOUT': 5,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 5,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,
    }
    internal_settings = {
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'FEED_URI': item_storage.name,
    }
    scrapy_settings = {
        **default_settings,
        **user_settings,
        **spider.settings,
        **internal_settings,  # last because these must not be overwritten
    }

    scrapy_spider_cls = None
    module = importlib.import_module(spider.module)
    for cls in iter_spider_classes(module):
        scrapy_spider_cls = cls
        break  # use first valid class in module
    if scrapy_spider_cls is None:
        raise RuntimeError(f'No valid spider class found in module {module}')

    process = CrawlerProcess(settings=None, install_root_handler=False)

    log_capture_string = StringIO()
    log_handler = logging.StreamHandler(log_capture_string)
    log_handler.setLevel(spider.log_level)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_handler.setFormatter(formatter)
    scrapy_logger = logging.getLogger('scrapy')
    scrapy_logger.addHandler(log_handler)
    # Scrapy uses the spider name as the logger id.
    spider_logger = logging.getLogger(scrapy_spider_cls.name)
    spider_logger.addHandler(log_handler)

    crawler = Crawler(scrapy_spider_cls, scrapy_settings)
    process.crawl(crawler)
    process.start()  # blocks here

    log_contents = log_capture_string.getvalue()
    spider_logger.removeHandler(log_handler)
    scrapy_logger.removeHandler(log_handler)
    log_capture_string.close()

    execution.time_ended = now()
    execution.stats = crawler.stats._stats
    execution.log = log_contents
    execution.save()

    item_storage.seek(0)
    items = []
    for line in item_storage.readlines():
        item_data = json.loads(line)
        item = Item(spider_id=spider_id, execution=execution, data=item_data)
        items.append(item)
    Item.objects.bulk_create(items, batch_size=100)

    item_storage.close()
    os.remove(item_storage.name)

def _load_spiders(self, module):
    # Use the built-in function for loading spiders.
    for spcls in iter_spider_classes(module):
        self._spiders[spcls.name] = spcls

def test_iter_spider_classes(self):
    import tests.test_utils_spider

    it = iter_spider_classes(tests.test_utils_spider)
    self.assertEqual(set(it), {MySpider1, MySpider2})

def _load_spiders(self, module):
    for spcls in iter_spider_classes(module):
        # A module's __name__ is the module name.
        self._found[spcls.name].append((module.__name__, spcls.__name__))
        # The name attribute of the spider class, not of an instance.
        self._spiders[spcls.name] = spcls

def _load_spiders(self, module):
    # iter_spider_classes() yields every Spider subclass defined in the given module.
    for spcls in iter_spider_classes(module):
        self._found[spcls.name].append((module.__name__, spcls.__name__))
        self._spiders[spcls.name] = spcls

def get_spiders_iter():
    for name in settings.get('SPIDER_MODULES'):
        for module in walk_modules(name):
            for spcls in iter_spider_classes(module):
                yield spcls

def iter_spider_from_module(modules):
    """Return a generator over all spider classes found under the given module path."""
    for m in walk_modules(modules):
        yield from iter_spider_classes(m)

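# A hedged usage sketch (not from the original source): 'myproject.spiders' is an
# illustrative package path, not one defined in this codebase.
for spider_cls in iter_spider_from_module('myproject.spiders'):
    print(spider_cls.name)
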
from sqlalchemy.sql import text

HERE = os.path.dirname(os.path.abspath(__file__))
product_spiders_root = os.path.dirname(HERE)
project_root = os.path.dirname(product_spiders_root)
sys.path.append(project_root)
sys.path.append(os.path.join(project_root, 'product_spiders'))

from product_spiders.db import Session
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
from productspidersweb.models import Spider

print(sys.path)

here = os.path.abspath(os.path.dirname(__file__))
db_session = Session()

spider_modules = ['product_spiders.spiders']
for name in spider_modules:
    for module in walk_modules(name):
        for spider in iter_spider_classes(module):
            sp = db_session.query(Spider).filter(Spider.name == spider.name).first()
            if sp:
                sp.module = str(spider.__module__)
                db_session.add(sp)

db_session.commit()

def all_true_case(self, module1):
    self._result = list(iter_spider_classes(module1))

def test_iter_spider_classes(self):
    import tests.test_utils_spider

    it = iter_spider_classes(tests.test_utils_spider)
    self.assertEqual(set(it), set([MySpider1, MySpider2]))

def d_false_others_true_case(self, module4):
    self._result = list(iter_spider_classes(module4))