def _iter_command_classes(module_name):
    """Yield every ScrapyCommand subclass defined in *module_name* or its submodules."""
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        # dict.values() exists on both Python 2 and 3; the previous
        # itervalues()/AttributeError fallback duplicated the whole loop
        # (plus a commented-out copy) for no gain.
        for obj in vars(module).values():
            if (inspect.isclass(obj)
                    and issubclass(obj, ScrapyCommand)
                    and obj.__module__ == module.__name__):
                yield obj
def __init__(self, settings):
    """Build the spider registry from the SPIDER_MODULES setting."""
    self.spider_modules = settings.getlist('SPIDER_MODULES')
    self._spiders = {}
    # Walk each configured package (including all nested submodules)
    # and register every spider found.
    for module_path in self.spider_modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
def __init__(self, spider_modules):
    """Load spiders from the given modules and hook the spider_closed signal."""
    self.spider_modules = spider_modules
    self._spiders = {}
    for module_path in self.spider_modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
    # Ensure close_spider runs whenever any spider finishes.
    dispatcher.connect(self.close_spider, signals.spider_closed)
def _iter_command_classes(module_name):
    """Yield each ScrapyCommand subclass defined directly in the walked modules."""
    # TODO: add `name` attribute to commands and and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for candidate in vars(module).values():
            is_local_command = (
                inspect.isclass(candidate)
                and issubclass(candidate, ScrapyCommand)
                and candidate.__module__ == module.__name__
            )
            if is_local_command:
                yield candidate
def __init__(self, settings):
    """Initialise the manager, eagerly load all spiders, and hook spider_closed."""
    super(SpiderManager, self).__init__(settings)
    self.modules = self.spider_modules.to_value()
    self._spiders = {}
    for module_path in self.modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
    # Run close_spider whenever a spider finishes.
    dispatcher.connect(self.close_spider, signals.spider_closed)
def get_spider_class(spider_name, project_settings):
    """Return the spider class named *spider_name*, or None if not found."""
    for spider_module in project_settings.get('SPIDER_MODULES'):
        module_iter = walk_modules(spider_module)
        # NOTE(review): the first walked module (the package itself) is
        # skipped here on purpose, it seems — confirm this is intended.
        for module in islice(module_iter, 1, None):
            for spider_class in iter_spider_classes(module):
                if spider_class.name == spider_name:
                    return spider_class
    return None
def _iter_command_classes(module_name):
    """Yield every ScrapyCommand subclass defined in the walked modules."""
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        # BUG FIX: .itervalues() is Python-2-only and raises AttributeError
        # on Python 3; .values() behaves the same for this use on both.
        for obj in vars(module).values():
            if (inspect.isclass(obj)
                    and issubclass(obj, ScrapyCommand)
                    and obj.__module__ == module.__name__):
                yield obj
def _load_all_spiders(self):
    """Import every module under SPIDER_MODULES and register its spiders.

    A module that fails to import is skipped with a RuntimeWarning instead
    of aborting the whole load.
    """
    for name in self.spider_modules:
        try:
            for module in walk_modules(name):
                self._load_spiders(module)
        except ImportError:  # the `as e` binding was unused; the traceback carries the detail
            msg = ("\n{tb}Could not load spiders from module '{modname}'. "
                   "Check SPIDER_MODULES setting".format(
                       modname=name, tb=traceback.format_exc()))
            warnings.warn(msg, RuntimeWarning)
def list(self):
    """
    Return a list with the names of all spiders available in the project.
    """
    # Populate the registry lazily on first use.
    if not self._spiders:
        for module_path in self.spider_modules:
            for mod in walk_modules(module_path):
                self._load_spiders(mod)
    return list(self._spiders.keys())
def assert_good_spider_type(settings, spider_name):
    """Raise if the spider named *spider_name* is not a ForumSpider."""
    spider_modules = settings['SPIDER_MODULES']
    # Allow SPIDER_MODULES to be a single module name as well as a list.
    if isinstance(spider_modules, basestring):
        spider_modules = [spider_modules]
    # BUG FIX: a second `spider_modules = settings['SPIDER_MODULES']` used to
    # follow here, silently undoing the string-to-list normalisation above.
    for spider_module in spider_modules:
        for module in walk_modules(spider_module):
            for spcls in iter_spider_classes(module):
                if spcls.name == spider_name:
                    if not issubclass(spcls, ForumSpider):
                        raise Exception('Spider %s is not a Forum Spider. Please use the right script for your spider.' % spider_name)
def test_walk_modules_egg(self):
    """walk_modules should discover modules packaged inside an egg."""
    egg_path = os.path.join(os.path.dirname(__file__), 'test.egg')
    sys.path.append(egg_path)
    try:
        found = {m.__name__ for m in walk_modules('testegg')}
        self.assertEqual(
            found,
            {'testegg.spiders', 'testegg.spiders.a',
             'testegg.spiders.b', 'testegg'},
        )
    finally:
        sys.path.remove(egg_path)
def load_spiders(self):
    """Load spiders from the project's SPIDER_MODULES with the project
    source directory temporarily on sys.path."""
    sys.path.append(self.source_path)
    try:
        self.spider_modules = load_object(
            self.project_setting_module + ".SPIDER_MODULES")
        for name in self.spider_modules:
            for module in walk_modules(name):
                self._load_spiders(module)
    finally:
        # Remove the entry we added rather than blindly popping the last
        # element, which could drop a path entry appended during the walk.
        sys.path.remove(self.source_path)
def find_spider_cls(spider_name, spider_packages):
    """ Find spider class which name is equal to `spider_name` argument
    :param spider_name: spider name to look for
    :param spider_packages: a list of package names that will be searched for spider classes
    """
    for package_name in spider_packages:
        for module in walk_modules(package_name):
            match = next(
                (cls for cls in iter_spider_classes(module)
                 if cls.name == spider_name),
                None)
            if match is not None:
                return match
def _load_all_spiders(self):
    """Load spiders from all configured modules, then check for duplicate names.

    Import failures either warn (when ``self.warn_only``) or propagate.
    """
    for name in self.spider_modules:
        try:
            for module in walk_modules(name):
                self._load_spiders(module)
        except ImportError:  # unused `as e` binding removed; traceback has the detail
            if self.warn_only:
                msg = ("\n{tb}Could not load spiders from module '{modname}'. "
                       "See above traceback for details.".format(
                           modname=name, tb=traceback.format_exc()))
                warnings.warn(msg, RuntimeWarning)
            else:
                raise
    self._check_name_duplicates()
def run(self, args, opts):
    """Regenerate docs/spiders.rst from the spider classes' docstrings."""
    # Repository root: three directory levels up from this file.
    basedir = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

    def _keyfunc(module):
        # Group spiders by leading name segment; the two-word country
        # prefixes keep both words as the group key.
        module_name = module.__name__.rsplit('.', 1)[-1]
        if module_name.startswith(('costa_rica', 'dominican_republic')):
            return '_'.join(module_name.split('_', 2)[:2])
        return module_name.split('_', 1)[0]

    # Keep the hand-written preamble up to (and including) the marker line.
    with open(os.path.join(basedir, 'docs', 'spiders.rst')) as f:
        lines = []
        for line in f:
            lines.append(line)
            if line.startswith('.. Do not edit past this line.'):
                break
    with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f:
        for line in lines:
            f.write(line)
        # NOTE(review): groupby only groups adjacent items — assumes
        # walk_modules yields modules ordered by key; confirm.
        for key, group in groupby(
                walk_modules('kingfisher_scrapy.spiders'), _keyfunc):
            if key in ('spiders', 'fail'):
                continue
            # One underlined section heading per group.
            f.write(
                f"\n{key.replace('_', ' ').title()}\n{'-' * len(key)}\n")
            for module in group:
                for cls in iter_spider_classes(module):
                    f.write(
                        f'\n.. autoclass:: {module.__name__}.{cls.__name__}\n :no-members:\n'
                    )
                    infix = ''
                    if cls.__doc__:
                        # Pull the "Environment variables" section out of
                        # the spider docstring, if present.
                        section = re.search(
                            r'^Environment variables\n(.+?)(?:^\S|\Z)',
                            dedent(cls.__doc__),
                            re.MULTILINE | re.DOTALL)
                        if section:
                            environment_variables = re.findall(
                                r'^(\S.+)\n ',
                                dedent(section[1]), re.MULTILINE)
                            infix = f"env {' '.join([f'{variable}=...' for variable in environment_variables])} "
                    f.write('\n.. code-block:: bash\n')
                    f.write(
                        f"\n {infix}scrapy crawl {module.__name__.rsplit('.')[-1]}\n"
                    )
def enumerate_spider_classes():
    """Yield (spider_project, spider_class) pairs for every project in the DB.

    Temporarily chdirs into each project so get_project_settings() picks up
    the right scrapy.cfg / settings module.
    """
    original_cd = os.getcwd()
    imported_settings = sys.modules.pop("settings", None)
    try:
        for spider_project in SpiderProject.objects.all():
            os.chdir(spider_project.path)
            os.environ.pop(ENVVAR, None)  # force get_project_settings() to reconsider the current directory
            project_settings = get_project_settings()
            for module_or_package_name in project_settings.get("SPIDER_MODULES"):
                for module in walk_modules(module_or_package_name):
                    for spider_cls in iter_spider_classes(module):
                        yield (spider_project, spider_cls)
    finally:
        # BUG FIX: restore global state even when an import fails mid-walk
        # or the consumer abandons the generator early; previously the cwd
        # and the popped "settings" module leaked in those cases.
        if imported_settings is not None:
            sys.modules["settings"] = imported_settings
        os.chdir(original_cd)
def test_walk_modules_egg(self):
    """walk_modules should discover modules packaged inside an egg."""
    egg = os.path.join(os.path.dirname(__file__), 'test.egg')
    sys.path.append(egg)
    try:
        mods = walk_modules('testegg')
        expected = [
            'testegg.spiders',
            'testegg.spiders.a',
            'testegg.spiders.b',
            'testegg'
        ]
        # assertEquals is a deprecated alias; use assertEqual.
        self.assertEqual(set([m.__name__ for m in mods]), set(expected))
    finally:
        sys.path.remove(egg)
def test_walk_modules(self):
    """walk_modules should return a package plus all nested submodules."""
    # assertEquals is a deprecated alias throughout; use assertEqual.
    mods = walk_modules('scrapy.tests.test_utils_misc.test_walk_modules')
    expected = [
        'scrapy.tests.test_utils_misc.test_walk_modules',
        'scrapy.tests.test_utils_misc.test_walk_modules.mod',
        'scrapy.tests.test_utils_misc.test_walk_modules.mod.mod0',
        'scrapy.tests.test_utils_misc.test_walk_modules.mod1',
    ]
    self.assertEqual(set([m.__name__ for m in mods]), set(expected))

    mods = walk_modules('scrapy.tests.test_utils_misc.test_walk_modules.mod')
    expected = [
        'scrapy.tests.test_utils_misc.test_walk_modules.mod',
        'scrapy.tests.test_utils_misc.test_walk_modules.mod.mod0',
    ]
    self.assertEqual(set([m.__name__ for m in mods]), set(expected))

    mods = walk_modules('scrapy.tests.test_utils_misc.test_walk_modules.mod1')
    expected = [
        'scrapy.tests.test_utils_misc.test_walk_modules.mod1',
    ]
    self.assertEqual(set([m.__name__ for m in mods]), set(expected))

    self.assertRaises(ImportError, walk_modules, 'nomodule999')
def load(self, spider_name):
    """Load a spider either from an explicit file path or by name.

    If *spider_name* is an existing filesystem path, the module at that
    path is imported directly; otherwise the configured spider modules
    are searched for a spider with that name.
    """
    if os.path.exists(spider_name):
        spider_path = self._get_spider_path(spider_name)
        spider_module = import_module(spider_path)
        spider = self._load_spider(spider_module, spider_name)
        if spider:
            return spider
    else:
        for name in self.spider_modules:
            for module in walk_modules(name):
                spider = self._load_spider(module, spider_name)
                if spider:
                    return spider
        raise RuntimeError(f"Spider not found: {spider_name}")
    # NOTE(review): reachable only when the path exists but yields no
    # spider — the exact indentation of this raise in the original is
    # ambiguous; confirm against the upstream file.
    raise RuntimeError(f"{spider_module} hasn't Spider Module")
def _iter_command_classes(module_name):
    """Yield every concrete ScrapyCommand subclass defined under *module_name*.

    Imports every module in the commands package and yields each class that
    subclasses ScrapyCommand and is defined in that module itself (not merely
    imported into it).  User-configured command packages go through the same
    path, so custom commands can be registered via settings.
    """
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for obj in vars(module).values():
            if inspect.isclass(obj) and \
                    issubclass(obj, ScrapyCommand) and \
                    obj.__module__ == module.__name__ and \
                    obj is not ScrapyCommand:  # identity check; `not obj == ...` was unidiomatic
                yield obj
def test_walk_modules(self):
    """walk_modules should return a package plus all nested submodules."""
    def names(module_path):
        return {m.__name__ for m in walk_modules(module_path)}

    self.assertEqual(
        names('tests.test_utils_misc.test_walk_modules'),
        {
            'tests.test_utils_misc.test_walk_modules',
            'tests.test_utils_misc.test_walk_modules.mod',
            'tests.test_utils_misc.test_walk_modules.mod.mod0',
            'tests.test_utils_misc.test_walk_modules.mod1',
        },
    )
    self.assertEqual(
        names('tests.test_utils_misc.test_walk_modules.mod'),
        {
            'tests.test_utils_misc.test_walk_modules.mod',
            'tests.test_utils_misc.test_walk_modules.mod.mod0',
        },
    )
    self.assertEqual(
        names('tests.test_utils_misc.test_walk_modules.mod1'),
        {'tests.test_utils_misc.test_walk_modules.mod1'},
    )
    self.assertRaises(ImportError, walk_modules, 'nomodule999')
def _load_all_items(self, item_modules=None):
    """Import every configured item module and register its items.

    :param item_modules: optional extra module name, appended (persistently)
        to ``self.item_modules`` before loading.
    """
    if item_modules:
        self.item_modules.append(item_modules)
    for name in self.item_modules:
        try:
            for module in walk_modules(name):
                self._load_items(module)
        except ImportError:  # unused `as e` binding removed
            if self.warn_only:
                # BUG FIX: the warning said "spiders" although this loader
                # handles items.
                msg = ("\n{tb}Could not load items from module '{modname}'. "
                       "See above traceback for details.".format(modname=name, tb=traceback.format_exc()))
                warnings.warn(msg, RuntimeWarning)
            else:
                raise
    self._loaded = True
def _load_all_spiders(self):
    """Load spiders from every configured module, warning or raising on import failure."""
    # TODO: Combine common code with other loaders into a common class
    for name in self.spider_modules:
        try:
            for module in walk_modules(name):
                self._load_spiders(module)
        except ImportError:  # unused `as e` binding removed; traceback has the detail
            if self.warn_only:
                msg = (
                    "\n{tb}Could not load spiders from module '{modname}'. "
                    "See above traceback for details.".format(
                        modname=name, tb=traceback.format_exc()))
                warnings.warn(msg, RuntimeWarning)
            else:
                raise
def _load_all_spiders(self):
    """Populate the registry from every SPIDER_MODULES entry, then verify
    that no two spiders share a name."""
    for module_path in self.spider_modules:
        try:
            for mod in walk_modules(module_path):
                self._load_spiders(mod)
        except ImportError:
            # Guard clause: propagate unless we are in warn-only mode.
            if not self.warn_only:
                raise
            warnings.warn(
                f"\n{traceback.format_exc()}Could not load spiders "
                f"from module '{module_path}'. "
                "See above traceback for details.",
                category=RuntimeWarning,
            )
    self._check_name_duplicates()
def _iter_command_classes(module_name):
    """Yield every ScrapyCommand subclass defined in the given package.

    ``walk_modules`` imports the package and all of its submodules (the
    argument is a dotted path such as ``json.decoder``).  For each module,
    ``vars()`` gives its attribute dict; we yield every attribute that is a
    class, subclasses ScrapyCommand, is defined in that very module (not
    merely imported into it), and is not ScrapyCommand itself.
    """
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for obj in vars(module).values():
            if (inspect.isclass(obj)  # it is a class
                    and issubclass(obj, ScrapyCommand)  # it is a command
                    and obj.__module__ == module.__name__  # defined here, not imported
                    and obj is not ScrapyCommand):  # identity check; `not obj == ...` was unidiomatic
                yield obj
def find_spiders(type=None):
    """Find all classes that subclass scrapy.Spider

    If type is given then the output is filtered by type. Possible type
    values ['vk', 'site'].

    NOTE: the parameter name shadows the builtin ``type``; it is kept for
    backward compatibility with keyword callers.
    """
    spider_map = {}

    def _get_spiders(spiders, spider_map):
        """Returns a list of all spiders with unique name found in a module

        If 2 spiders with the same name are found, that subclass one another,
        then the child one is taken (based on mro)
        """
        # if two spiders with the same name are found, then take the one that
        # subclasses the autogenerated
        for s in spiders:
            if s.name in spider_map:
                # leave only the one that subclasses parent
                old = spider_map[s.name]
                if old in s.mro():
                    spider_map[s.name] = s
            else:
                spider_map[s.name] = s  # the same one as passed with new values
        return spider_map

    for module in misc.walk_modules(settings.NEWSPIDER_MODULE):
        # crawl responsibly
        # Clarified from `s.type == type and type or not type`, which relied
        # on and/or precedence: keep everything when no filter is given,
        # otherwise keep only matching types (behavior unchanged).
        spiders = [s for s in spider.iter_spider_classes(module)
                   if not type or s.type == type]
        _get_spiders(spiders, spider_map)
    # add user generated modules
    user_spiders = autogenerate.load_spiders_from_json(
        settings.USER_SPIDERS_FILE)
    _get_spiders(user_spiders, spider_map)
    # check for name uniqueness
    return spider_map.values()
def load(self, spider_name):
    """Resolve *spider_name* to a spider class.

    Resolution order:
      1. Look the spider up in the DB cache; if its module path is cached,
         import just that module, otherwise walk all configured spider
         modules.  Then delegate to the parent loader.
      2. On KeyError (name unknown to the parent loader), try each
         registered special-spider class getter before re-raising.
    """
    try:
        spider = load_spider_db_data(spider_name)
        if spider.module:
            # Fast path: the module path was cached in the DB record.
            mod = importlib.import_module(spider.module)
            log_msg('Found module path on cache')
            self._load_spider(mod, spider_name)
        else:
            # Slow path: scan every configured spider module.
            for name in self.spider_modules:
                for module in walk_modules(name):
                    self._load_spider(module, spider_name)
        return super(CustomCrawlMethodSpiderManager, self).load(spider_name)
    except KeyError:
        # Parent loader does not know this name; fall back to the special
        # spider-class getters before giving up.
        for cls_name, get_cls_func in special_spiders_cls_getters.items():
            spcls = get_cls_func(spider_name)
            if spcls is not None:
                log_msg("Got spider for class %s: %s" % (cls_name, spider_name))
                return spcls
        raise
def iter_spider_from_module(modules):
    """Yield every spider class found in the given module path and its submodules."""
    for module in walk_modules(modules):
        for spider_cls in iter_spider_classes(module):
            yield spider_cls
def run(self, args, opts):
    """Run the Checker over every spider class in kingfisher_scrapy.spiders."""
    for mod in walk_modules('kingfisher_scrapy.spiders'):
        for spider_cls in iter_spider_classes(mod):
            Checker(mod, spider_cls).check()
def iterate_all_newsspiders():
    """Yield every NewsSpider subclass found in this package's submodules."""
    submodules = walk_modules(__name__)
    # Skip the first entry, which is this package itself.
    for submodule in submodules[1:]:
        for spider_cls in util.iterate_subclasses_in_module(submodule, NewsSpider):
            yield spider_cls
def __init__(self, spider_modules):
    """Build the spider registry from the given module names."""
    self.spider_modules = spider_modules
    self._spiders = {}
    for module_path in self.spider_modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
def __init__(self, settings):
    """Read SPIDER_MODULES from settings and load every spider found."""
    self.spider_modules = settings.getlist('SPIDER_MODULES')
    self._spiders = {}
    for module_path in self.spider_modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
def __init__(self, settings):
    """Build the spider registry from settings['SPIDER_MODULES']."""
    self.spider_modules = settings['SPIDER_MODULES']
    self._spiders = {}
    for module_path in self.spider_modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
def _load_all_spiders(self):
    """Walk every configured spider module and register its spiders."""
    for module_path in self.spider_modules:
        for mod in walk_modules(module_path):
            self._load_spiders(mod)
from sqlalchemy.sql import text

# Make the project root and product_spiders importable.
HERE = os.path.dirname(os.path.abspath(__file__))
product_spiders_root = os.path.dirname(HERE)
project_root = os.path.dirname(product_spiders_root)
sys.path.append(project_root)
sys.path.append(os.path.join(project_root, 'product_spiders'))

from product_spiders.db import Session
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
from productspidersweb.models import Spider

# BUG FIX: `print sys.path` was Python-2-only statement syntax; use the
# function form so the script also runs on Python 3.
print(sys.path)

here = os.path.abspath(os.path.dirname(__file__))

db_session = Session()

# Record each spider class's module path on its DB row, then commit once.
spider_modules = ['product_spiders.spiders']
for name in spider_modules:
    for module in walk_modules(name):
        for spider in iter_spider_classes(module):
            sp = db_session.query(Spider).filter(
                Spider.name == spider.name).first()
            if sp:
                sp.module = str(spider.__module__)
                db_session.add(sp)

db_session.commit()
def __init__(self, domain):
    """Collect template parsers for *domain* from the configured template package."""
    self.base = settings.get('TEMPLATE_MODULES', 'yowa.templates')
    self.template_module = '.'.join((self.base, domain))
    self._parsers = {}
    for mod in walk_modules(self.template_module):
        self._filter_parsers(mod)
def get_spiders_iter():
    """Yield every spider class declared under the configured SPIDER_MODULES."""
    for module_path in settings.get('SPIDER_MODULES'):
        for mod in walk_modules(module_path):
            for spider_cls in iter_spider_classes(mod):
                yield spider_cls