Ejemplo n.º 1
0
def _iter_command_classes(module_name):
    """Yield the ScrapyCommand subclasses defined under ``module_name``.

    Every module returned by ``walk_modules(module_name)`` is inspected and
    each class that subclasses ``ScrapyCommand`` and whose ``__module__``
    equals the inspected module's name is yielded.

    :param module_name: dotted module/package path handed to ``walk_modules``
    """
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        # dict.values() exists on both Python 2.7 (returns a list) and
        # Python 3.x (returns a view), so no itervalues()/AttributeError
        # fallback is needed.  The previous fallback wrapped the whole loop,
        # so an AttributeError raised part-way through would restart the
        # iteration and yield the same classes twice.
        for obj in vars(module).values():
            if inspect.isclass(obj) and \
                    issubclass(obj, ScrapyCommand) and \
                    obj.__module__ == module.__name__:
                yield obj
Ejemplo n.º 2
0
 def __init__(self, settings):
     """Read SPIDER_MODULES from ``settings`` and feed each module found
     to ``self._load_spiders``."""
     self.spider_modules = settings.getlist('SPIDER_MODULES')
     self._spiders = {}
     # Walk every SPIDER_MODULES entry together with all of its submodules.
     discovered = (
         module
         for name in self.spider_modules
         for module in walk_modules(name)
     )
     for module in discovered:
         self._load_spiders(module)
Ejemplo n.º 3
0
 def __init__(self, spider_modules):
     """Store ``spider_modules``, pass every module walked from them to
     ``self._load_spiders`` and hook ``close_spider`` to ``spider_closed``."""
     self.spider_modules = spider_modules
     self._spiders = {}
     for package_name in self.spider_modules:
         modules = walk_modules(package_name)
         for mod in modules:
             self._load_spiders(mod)
     # Register close_spider as receiver of the spider_closed signal.
     dispatcher.connect(self.close_spider, signals.spider_closed)
Ejemplo n.º 4
0
def _iter_command_classes(module_name):
    """Yield each ScrapyCommand subclass defined in the modules walked from
    ``module_name`` (classes merely imported into a module are skipped)."""
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        for candidate in vars(module).values():
            if not inspect.isclass(candidate):
                continue
            if not issubclass(candidate, ScrapyCommand):
                continue
            # Only classes whose __module__ is the module being inspected.
            if candidate.__module__ == module.__name__:
                yield candidate
Ejemplo n.º 5
0
 def __init__(self, settings):
     """Initialise the parent with ``settings``, then pass every module
     walked from ``self.modules`` to ``self._load_spiders``."""
     super(SpiderManager, self).__init__(settings)
     self.modules = self.spider_modules.to_value()
     self._spiders = {}
     walked = (m for name in self.modules for m in walk_modules(name))
     for module in walked:
         self._load_spiders(module)
     # Register close_spider as receiver of the spider_closed signal.
     dispatcher.connect(self.close_spider, signals.spider_closed)
Ejemplo n.º 6
0
def get_spider_class(spider_name, project_settings):
    """Return the spider class whose ``name`` equals ``spider_name``.

    Searches the modules walked from each entry of the ``SPIDER_MODULES``
    setting and returns the first match, or ``None`` when nothing matches.
    """
    for package in project_settings.get('SPIDER_MODULES'):
        # islice(..., 1, None) skips the first module walk_modules produces
        # (presumably the package itself -- confirm against walk_modules).
        candidates = (
            cls
            for module in islice(walk_modules(package), 1, None)
            for cls in iter_spider_classes(module)
        )
        for cls in candidates:
            if cls.name == spider_name:
                return cls
    return None
Ejemplo n.º 7
0
def _iter_command_classes(module_name):
    """Yield the ScrapyCommand subclasses defined under ``module_name``.

    Each module returned by ``walk_modules(module_name)`` is inspected; a
    class is yielded when it subclasses ``ScrapyCommand`` and its
    ``__module__`` equals the inspected module's name.
    """
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    for module in walk_modules(module_name):
        # .values() instead of the Python-2-only .itervalues(): it behaves
        # the same for this loop and also works on Python 3.
        for obj in vars(module).values():
            if inspect.isclass(obj) and \
               issubclass(obj, ScrapyCommand) and \
               obj.__module__ == module.__name__:
                yield obj
Ejemplo n.º 8
0
 def _load_all_spiders(self):
     """Pass every module walked from ``self.spider_modules`` to
     ``self._load_spiders``; an ImportError becomes a RuntimeWarning."""
     for name in self.spider_modules:
         try:
             modules = walk_modules(name)
             for module in modules:
                 self._load_spiders(module)
         except ImportError:
             # Keep going with the next entry; the traceback is embedded in
             # the warning text.
             warnings.warn(
                 "\n{tb}Could not load spiders from module '{modname}'. "
                 "Check SPIDER_MODULES setting".format(
                     modname=name, tb=traceback.format_exc()),
                 RuntimeWarning)
Ejemplo n.º 9
0
 def _load_all_spiders(self):
     """Walk each entry of ``self.spider_modules`` and hand the modules to
     ``self._load_spiders``, warning (not raising) on ImportError."""
     for modname in self.spider_modules:
         try:
             for mod in walk_modules(modname):
                 self._load_spiders(mod)
         except ImportError:
             tb = traceback.format_exc()
             msg = ("\n{tb}Could not load spiders from module '{modname}'. "
                    "Check SPIDER_MODULES setting".format(
                         modname=modname, tb=tb))
             warnings.warn(msg, RuntimeWarning)
    def list(self):
        """
        Return the names of all spiders available in the project as a list.

        When ``self._spiders`` is empty, the modules walked from
        ``self.spider_modules`` are first passed to ``self._load_spiders``.
        """
        if not self._spiders:
            walked = (
                module
                for name in self.spider_modules
                for module in walk_modules(name)
            )
            for module in walked:
                self._load_spiders(module)

        return list(self._spiders.keys())
Ejemplo n.º 11
0
def assert_good_spider_type(settings, spider_name):
	"""Raise if the spider named ``spider_name`` is not a ForumSpider subclass.

	Looks through every spider class found in the modules walked from
	``settings['SPIDER_MODULES']`` (a single module name given as a string is
	accepted too) and raises ``Exception`` when a class whose ``name`` equals
	``spider_name`` does not subclass ``ForumSpider``.  Returns ``None``
	otherwise, including when no spider with that name exists.
	"""
	try:
		string_types = basestring  # noqa: F821 -- Python 2
	except NameError:
		string_types = str  # Python 3

	spider_modules = settings['SPIDER_MODULES']
	# Normalise a bare string to a one-element list.  The setting must not be
	# re-read after this point: doing so threw the normalisation away and made
	# the loop below iterate over the characters of the string.
	if isinstance(spider_modules, string_types):
		spider_modules = [spider_modules]

	for spider_module in spider_modules:
		for module in walk_modules(spider_module):
			for spcls in iter_spider_classes(module):
				if spcls.name == spider_name:
					if not issubclass(spcls, ForumSpider):
						raise Exception('Spider %s is not a Forum Spider. Please use the right script for your spider.' % spider_name)
 def test_walk_modules_egg(self):
     # walk_modules('testegg') must report exactly the modules listed below
     # once test.egg (next to this file) is on sys.path.
     egg = os.path.join(os.path.dirname(__file__), 'test.egg')
     sys.path.append(egg)
     try:
         found = {mod.__name__ for mod in walk_modules('testegg')}
         wanted = {
             'testegg.spiders',
             'testegg.spiders.a',
             'testegg.spiders.b',
             'testegg',
         }
         self.assertEqual(found, wanted)
     finally:
         # Always take the egg off sys.path again.
         sys.path.remove(egg)
Ejemplo n.º 13
0
 def load_spiders(self):
     """Load SPIDER_MODULES from the project settings module and pass every
     module walked from them to ``self._load_spiders``.

     ``self.source_path`` is appended to ``sys.path`` for the duration of the
     call; the last ``sys.path`` entry is removed again afterwards.
     """
     sys.path.append(self.source_path)
     try:
         setting_path = self.project_setting_module + ".SPIDER_MODULES"
         self.spider_modules = load_object(setting_path)

         for package in self.spider_modules:
             for mod in walk_modules(package):
                 self._load_spiders(mod)
     finally:
         # Drops whatever is last on sys.path (expected to be source_path).
         sys.path.pop()
Ejemplo n.º 14
0
def find_spider_cls(spider_name, spider_packages):
    """
    Return the first spider class whose ``name`` equals `spider_name`.

    :param spider_name: spider name to look for
    :param spider_packages: a list of package names that will be searched for
        spider classes
    :returns: the matching class, or ``None`` when no class matches
    """
    matches = (
        spider_cls
        for package_name in spider_packages
        for module in walk_modules(package_name)
        for spider_cls in iter_spider_classes(module)
        if spider_cls.name == spider_name
    )
    # The generator is lazy, so the search stops at the first hit.
    return next(matches, None)
Ejemplo n.º 15
0
def find_spider_cls(spider_name, spider_packages):
    """
    Look up a spider class by its ``name`` attribute.

    :param spider_name: spider name to look for
    :param spider_packages: a list of package names that will be searched for
        spider classes
    :returns: the first class whose name matches, ``None`` if there is none
    """
    for pkg in spider_packages:
        for mod in walk_modules(pkg):
            for candidate in iter_spider_classes(mod):
                if not candidate.name == spider_name:
                    continue
                return candidate
    return None
Ejemplo n.º 16
0
 def _load_all_spiders(self):
     """Pass every module walked from ``self.spider_modules`` to
     ``self._load_spiders``, then run ``self._check_name_duplicates()``.

     An ImportError is re-raised unless ``self.warn_only`` is set, in which
     case a RuntimeWarning carrying the traceback is issued instead.
     """
     for name in self.spider_modules:
         try:
             for module in walk_modules(name):
                 self._load_spiders(module)
         except ImportError:
             if not self.warn_only:
                 raise
             msg = ("\n{tb}Could not load spiders from module '{modname}'. "
                    "See above traceback for details.".format(
                         modname=name, tb=traceback.format_exc()))
             warnings.warn(msg, RuntimeWarning)
     self._check_name_duplicates()
Ejemplo n.º 17
0
    def run(self, args, opts):
        """Regenerate the auto-generated tail of ``docs/spiders.rst``.

        The file is read up to and including the line starting with
        ``.. Do not edit past this line.``; it is then rewritten with those
        lines followed by one section per module group, containing an
        ``autoclass`` directive and a ``scrapy crawl`` example per spider class.
        """
        basedir = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
        filename = os.path.join(basedir, 'docs', 'spiders.rst')

        def _keyfunc(module):
            # Group key: the leading word of the module's last path component,
            # or the leading two words for the two-word prefixes listed here.
            module_name = module.__name__.rsplit('.', 1)[-1]
            if module_name.startswith(('costa_rica', 'dominican_republic')):
                return '_'.join(module_name.split('_', 2)[:2])
            return module_name.split('_', 1)[0]

        # Keep the hand-written part: everything through the marker line.
        preserved = []
        with open(filename) as f:
            for line in f:
                preserved.append(line)
                if line.startswith('.. Do not edit past this line.'):
                    break

        with open(filename, 'w') as f:
            f.writelines(preserved)

            modules = walk_modules('kingfisher_scrapy.spiders')
            for key, group in groupby(modules, _keyfunc):
                if key in ('spiders', 'fail'):
                    continue

                # Section heading with an underline as long as the key.
                f.write(
                    f"\n{key.replace('_', ' ').title()}\n{'-' * len(key)}\n")

                for module in group:
                    for cls in iter_spider_classes(module):
                        f.write(
                            f'\n.. autoclass:: {module.__name__}.{cls.__name__}\n   :no-members:\n'
                        )

                        # Look for an "Environment variables" section in the
                        # class docstring, if there is a docstring at all.
                        section = None
                        if cls.__doc__:
                            section = re.search(
                                r'^Environment variables\n(.+?)(?:^\S|\Z)',
                                dedent(cls.__doc__), re.MULTILINE | re.DOTALL)

                        infix = ''
                        if section:
                            environment_variables = re.findall(
                                r'^(\S.+)\n  ', dedent(section[1]),
                                re.MULTILINE)
                            infix = f"env {' '.join([f'{variable}=...' for variable in environment_variables])} "

                        f.write('\n.. code-block:: bash\n')
                        f.write(
                            f"\n   {infix}scrapy crawl {module.__name__.rsplit('.')[-1]}\n"
                        )
Ejemplo n.º 18
0
def enumerate_spider_classes():
    """Yield ``(spider_project, spider_cls)`` for every spider of every project.

    For each ``SpiderProject`` the working directory is switched to the
    project's ``path`` and the project settings are re-read, then every spider
    class found in the modules walked from its ``SPIDER_MODULES`` is yielded.

    The current working directory and any previously imported ``settings``
    module are restored when the generator finishes, is closed early, or
    fails with an exception.  While the generator is suspended between items
    the working directory is the one of the project being enumerated.
    """
    original_cd = os.getcwd()
    imported_settings = sys.modules.pop("settings", None)
    try:
        for spider_project in SpiderProject.objects.all():
            os.chdir(spider_project.path)
            os.environ.pop(ENVVAR, None)  # force get_project_settings() to reconsider the current directory
            project_settings = get_project_settings()
            for module_or_package_name in project_settings.get("SPIDER_MODULES"):
                for module in walk_modules(module_or_package_name):
                    for spider_cls in iter_spider_classes(module):
                        yield (spider_project, spider_cls)
    finally:
        # Runs on normal exhaustion, on generator.close() and on errors, so
        # the process-wide state is never left pointing at a spider project.
        if imported_settings is not None:
            sys.modules["settings"] = imported_settings
        os.chdir(original_cd)
Ejemplo n.º 19
0
 def _load_all_spiders(self):
     """Load spiders from every configured module, then check for duplicate
     names via ``self._check_name_duplicates()``.

     With ``self.warn_only`` set an ImportError is reported as a
     RuntimeWarning; otherwise it propagates.
     """
     for modname in self.spider_modules:
         try:
             modules = walk_modules(modname)
             for mod in modules:
                 self._load_spiders(mod)
         except ImportError:
             if self.warn_only:
                 tb = traceback.format_exc()
                 warnings.warn(
                     "\n{tb}Could not load spiders from module '{modname}'. "
                     "See above traceback for details.".format(
                         modname=modname, tb=tb),
                     RuntimeWarning)
             else:
                 raise
     self._check_name_duplicates()
Ejemplo n.º 20
0
 def test_walk_modules_egg(self):
     # walk_modules('testegg') must report exactly the modules listed below
     # once test.egg (next to this file) is on sys.path.
     egg = os.path.join(os.path.dirname(__file__), 'test.egg')
     sys.path.append(egg)
     try:
         mods = walk_modules('testegg')
         expected = [
             'testegg.spiders',
             'testegg.spiders.a',
             'testegg.spiders.b',
             'testegg'
         ]
         # assertEqual: the assertEquals alias is deprecated and no longer
         # exists on Python 3.12+.
         self.assertEqual(set([m.__name__ for m in mods]), set(expected))
     finally:
         # Always take the egg off sys.path again.
         sys.path.remove(egg)
Ejemplo n.º 21
0
    def test_walk_modules(self):
        # walk_modules must return the requested module plus every module
        # below it, and raise ImportError for a module that does not exist.
        # assertEqual is used throughout: the assertEquals alias is deprecated
        # and no longer exists on Python 3.12+.
        mods = walk_modules('scrapy.tests.test_utils_misc.test_walk_modules')
        expected = [
            'scrapy.tests.test_utils_misc.test_walk_modules',
            'scrapy.tests.test_utils_misc.test_walk_modules.mod',
            'scrapy.tests.test_utils_misc.test_walk_modules.mod.mod0',
            'scrapy.tests.test_utils_misc.test_walk_modules.mod1',
        ]
        self.assertEqual(set([m.__name__ for m in mods]), set(expected))

        mods = walk_modules('scrapy.tests.test_utils_misc.test_walk_modules.mod')
        expected = [
            'scrapy.tests.test_utils_misc.test_walk_modules.mod',
            'scrapy.tests.test_utils_misc.test_walk_modules.mod.mod0',
        ]
        self.assertEqual(set([m.__name__ for m in mods]), set(expected))

        mods = walk_modules('scrapy.tests.test_utils_misc.test_walk_modules.mod1')
        expected = [
            'scrapy.tests.test_utils_misc.test_walk_modules.mod1',
        ]
        self.assertEqual(set([m.__name__ for m in mods]), set(expected))

        self.assertRaises(ImportError, walk_modules, 'nomodule999')
Ejemplo n.º 22
0
 def load(self, spider_name):
     """Return the spider identified by ``spider_name``.

     When ``spider_name`` is an existing filesystem path, the module derived
     from it via ``self._get_spider_path`` is imported and searched;
     otherwise every module walked from ``self.spider_modules`` is searched.

     :raises RuntimeError: when no spider is found on either route
     """
     if not os.path.exists(spider_name):
         # Not a path: search the configured spider modules.
         for name in self.spider_modules:
             for module in walk_modules(name):
                 spider = self._load_spider(module, spider_name)
                 if spider:
                     return spider
         raise RuntimeError(f"Spider not found: {spider_name}")

     spider_path = self._get_spider_path(spider_name)
     spider_module = import_module(spider_path)
     spider = self._load_spider(spider_module, spider_name)
     if spider:
         return spider
     raise RuntimeError(f"{spider_module} hasn't Spider Module")
Ejemplo n.º 23
0
def _iter_command_classes(module_name):
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    ## Iterate over every module of the package and find the Command classes,
    ## i.e. the subclasses of ScrapyCommand.
    ## Original author's note on the wider flow (the callers are not shown
    ## here): all modules of the commands package are imported and a
    ## { cmd_name: cmd_instance, ... } dict is built; command classes the user
    ## configured in the settings are added as well, so custom commands can be
    ## written, registered in the settings and then used.
    for module in walk_modules(module_name):
        for candidate in vars(module).values():
            if not inspect.isclass(candidate):
                continue
            if not issubclass(candidate, ScrapyCommand):
                continue
            if not candidate.__module__ == module.__name__:
                continue
            # The ScrapyCommand base class itself is not a command.
            if not candidate == ScrapyCommand:
                yield candidate
    def test_walk_modules(self):
        # Each case: (module path to walk, module names expected back).
        cases = [
            ('tests.test_utils_misc.test_walk_modules', [
                'tests.test_utils_misc.test_walk_modules',
                'tests.test_utils_misc.test_walk_modules.mod',
                'tests.test_utils_misc.test_walk_modules.mod.mod0',
                'tests.test_utils_misc.test_walk_modules.mod1',
            ]),
            ('tests.test_utils_misc.test_walk_modules.mod', [
                'tests.test_utils_misc.test_walk_modules.mod',
                'tests.test_utils_misc.test_walk_modules.mod.mod0',
            ]),
            ('tests.test_utils_misc.test_walk_modules.mod1', [
                'tests.test_utils_misc.test_walk_modules.mod1',
            ]),
        ]
        for path, expected in cases:
            mods = walk_modules(path)
            self.assertEqual({m.__name__ for m in mods}, set(expected))

        # A module that does not exist must raise ImportError.
        self.assertRaises(ImportError, walk_modules, 'nomodule999')
Ejemplo n.º 25
0
 def _load_all_items(self, item_modules=None):
     """Pass every module walked from ``self.item_modules`` to
     ``self._load_items`` and mark the loader as loaded.

     :param item_modules: optional extra entry; when truthy it is appended
         (as a single element) to ``self.item_modules`` before loading
     :raises ImportError: when a module cannot be imported and
         ``self.warn_only`` is false; with ``warn_only`` set a RuntimeWarning
         carrying the traceback is issued instead
     """
     if item_modules:
         self.item_modules.append(item_modules)
     for name in self.item_modules:
         try:
             for module in walk_modules(name):
                 self._load_items(module)
         except ImportError:
             if self.warn_only:
                 # This loader handles items, so the warning says "items"
                 # (the text previously said "spiders").
                 msg = ("\n{tb}Could not load items from module '{modname}'. "
                        "See above traceback for details.".format(modname=name, tb=traceback.format_exc()))
                 warnings.warn(msg, RuntimeWarning)
             else:
                 raise
     self._loaded = True
Ejemplo n.º 26
0
 def _load_all_spiders(self):
     """Pass every module walked from ``self.spider_modules`` to
     ``self._load_spiders``; ImportError is re-raised unless
     ``self.warn_only`` is set, in which case it becomes a RuntimeWarning."""
     # TODO: Combine common code with other loaders into a common class
     for name in self.spider_modules:
         try:
             for module in walk_modules(name):
                 self._load_spiders(module)
         except ImportError:
             if not self.warn_only:
                 raise
             msg = (
                 "\n{tb}Could not load spiders from module '{modname}'. "
                 "See above traceback for details.".format(
                     modname=name, tb=traceback.format_exc()))
             warnings.warn(msg, RuntimeWarning)
Ejemplo n.º 27
0
 def _load_all_spiders(self):
     """Load spiders from every module walked from ``self.spider_modules``,
     then run ``self._check_name_duplicates()``.

     ImportError propagates unless ``self.warn_only`` is set; then a
     RuntimeWarning containing the traceback is emitted instead.
     """
     for name in self.spider_modules:
         try:
             modules = walk_modules(name)
             for module in modules:
                 self._load_spiders(module)
         except ImportError:
             if not self.warn_only:
                 raise
             warnings.warn(
                 f"\n{traceback.format_exc()}Could not load spiders "
                 f"from module '{name}'. "
                 "See above traceback for details.",
                 category=RuntimeWarning,
             )
     self._check_name_duplicates()
Ejemplo n.º 28
0
def _iter_command_classes(module_name):
    # TODO: add `name` attribute to commands and merge this function with
    # scrapy.utils.spider.iter_spider_classes
    """Yield the ScrapyCommand subclasses found under ``module_name``.

    Every attribute of each walked module is checked: it is yielded when it
    is a class, subclasses ScrapyCommand, is defined in that very module and
    is not ScrapyCommand itself.  walk_modules lists the module together with
    its submodules; vars(x) returns the dict of x's attributes (name -> value).
    """
    # module_name is a dotted path, e.g. something like json.decode
    for module in walk_modules(module_name):
        # Go through every attribute defined on the module.
        for candidate in vars(module).values():
            if not inspect.isclass(candidate):  # must be a class
                continue
            if not issubclass(candidate, ScrapyCommand):  # of the target type
                continue
            # Must be defined in this module, not merely imported into it.
            if not candidate.__module__ == module.__name__:
                continue
            # Skip the ScrapyCommand base class itself.
            if not candidate == ScrapyCommand:
                yield candidate
Ejemplo n.º 29
0
def find_spiders(type=None):
    """Collect the spider classes, keyed by unique spider name.

    Spider classes come from the modules walked from
    ``settings.NEWSPIDER_MODULE`` and from the user spiders loaded from
    ``settings.USER_SPIDERS_FILE``.  If type is given, the classes found in
    the modules are filtered by it (possible type values: ['vk', 'site']).

    Returns the values of the name -> class mapping.
    """

    spider_map = {}

    def _get_spiders(spiders, spider_map):
        """Merge ``spiders`` into ``spider_map`` and return the mapping.

        When a name is already present, the new class replaces the stored
        one only if the stored class appears in the new class's mro, i.e.
        the child class wins.
        """
        for s in spiders:
            name = s.name
            if name not in spider_map or spider_map[name] in s.mro():
                spider_map[name] = s
        # Same dict object that was passed in, now updated.
        return spider_map

    for module in misc.walk_modules(settings.NEWSPIDER_MODULE):
        # crawl responsibly
        spiders = []
        for s in spider.iter_spider_classes(module):
            # Keep everything when no type was requested, else matches only.
            if (s.type == type and type) or not type:
                spiders.append(s)
        _get_spiders(spiders, spider_map)

    # Add user generated modules; these are not filtered by type.
    user_spiders = autogenerate.load_spiders_from_json(
        settings.USER_SPIDERS_FILE)
    _get_spiders(user_spiders, spider_map)
    # Names are unique at this point because they are the mapping's keys.
    return spider_map.values()
    def load(self, spider_name):
        """Load ``spider_name`` and return what the parent ``load`` returns.

        The spider's DB record decides where to look: when it has a
        ``module`` that module is imported directly, otherwise every module
        walked from ``self.spider_modules`` is tried.  If a KeyError is
        raised, the getters in ``special_spiders_cls_getters`` are consulted
        and the first non-None result is returned; if none matches the
        KeyError is re-raised.
        """
        try:
            spider = load_spider_db_data(spider_name)
            if not spider.module:
                for name in self.spider_modules:
                    for module in walk_modules(name):
                        self._load_spider(module, spider_name)
            else:
                mod = importlib.import_module(spider.module)
                log_msg('Found module path on cache')
                self._load_spider(mod, spider_name)

            parent = super(CustomCrawlMethodSpiderManager, self)
            return parent.load(spider_name)
        except KeyError:
            for cls_name, get_cls_func in special_spiders_cls_getters.items():
                spcls = get_cls_func(spider_name)
                if spcls is None:
                    continue
                log_msg("Got spider for class %s: %s" %
                        (cls_name, spider_name))
                return spcls
            raise
Ejemplo n.º 31
0
def iter_spider_from_module(modules):
    """Yield every spider class found in the modules walked from ``modules``."""
    for module in walk_modules(modules):
        spider_classes = iter_spider_classes(module)
        yield from spider_classes
Ejemplo n.º 32
0
 def run(self, args, opts):
     """Run ``Checker(module, cls).check()`` for every spider class found
     under ``kingfisher_scrapy.spiders``."""
     for module in walk_modules('kingfisher_scrapy.spiders'):
         for spider_cls in iter_spider_classes(module):
             checker = Checker(module, spider_cls)
             checker.check()
Ejemplo n.º 33
0
def iterate_all_newsspiders():
    """Yield every NewsSpider subclass found in the modules below this one."""
    # [1:] drops the first entry walk_modules returns for this package.
    for submodule in walk_modules(__name__)[1:]:
        subclasses = util.iterate_subclasses_in_module(submodule, NewsSpider)
        for website in subclasses:
            yield website
Ejemplo n.º 34
0
 def __init__(self, spider_modules):
     """Store ``spider_modules`` and pass every module walked from them to
     ``self._load_spiders``."""
     self.spider_modules = spider_modules
     self._spiders = {}
     walked = (
         module
         for name in self.spider_modules
         for module in walk_modules(name)
     )
     for module in walked:
         self._load_spiders(module)
Ejemplo n.º 35
0
 def __init__(self, settings):
     """Read the SPIDER_MODULES list from ``settings`` and pass every module
     walked from it to ``self._load_spiders``."""
     self.spider_modules = settings.getlist('SPIDER_MODULES')
     self._spiders = {}
     for package_name in self.spider_modules:
         modules = walk_modules(package_name)
         for mod in modules:
             self._load_spiders(mod)
Ejemplo n.º 36
0
 def __init__(self, settings):
     """Take ``settings['SPIDER_MODULES']`` and pass every module walked
     from it to ``self._load_spiders``."""
     self.spider_modules = settings['SPIDER_MODULES']
     self._spiders = {}
     walked = (m for name in self.spider_modules for m in walk_modules(name))
     for module in walked:
         self._load_spiders(module)
Ejemplo n.º 37
0
 def _load_all_spiders(self):
     """Pass every module walked from ``self.spider_modules`` to
     ``self._load_spiders``."""
     for package_name in self.spider_modules:
         modules = walk_modules(package_name)
         for mod in modules:
             self._load_spiders(mod)
Ejemplo n.º 38
0
 def __init__(self, spider_modules):
     """Remember ``spider_modules`` and load the spiders of each module
     walked from them via ``self._load_spiders``."""
     self.spider_modules = spider_modules
     self._spiders = {}
     for package_name in self.spider_modules:
         for mod in walk_modules(package_name):
             # One call per walked module, in walk order.
             self._load_spiders(mod)
Ejemplo n.º 39
0
 def _load_all_spiders(self):
     """Walk each entry of ``self.spider_modules`` and hand every module to
     ``self._load_spiders``."""
     walked = (
         module
         for name in self.spider_modules
         for module in walk_modules(name)
     )
     for module in walked:
         self._load_spiders(module)
Ejemplo n.º 40
0
from sqlalchemy.sql import text

# Script: record, for every spider class found under product_spiders.spiders,
# the module it is defined in on the matching Spider row in the database.

# Directory layout relative to this file: this file's directory, its parent,
# and the parent of that.
HERE = os.path.dirname(os.path.abspath(__file__))
product_spiders_root = os.path.dirname(HERE)
project_root = os.path.dirname(product_spiders_root)

# sys.path is extended before the imports below run.
# NOTE(review): presumably needed so the project-local packages resolve --
# confirm before reordering any of these statements.
sys.path.append(project_root)
sys.path.append(os.path.join(project_root, 'product_spiders'))

from product_spiders.db import Session
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
from productspidersweb.models import Spider

# Python 2 print statement: dumps the effective import path.
print sys.path
# Same directory as HERE; not used again in the visible part of the script.
here = os.path.abspath(os.path.dirname(__file__))

db_session = Session()

spider_modules = ['product_spiders.spiders']

for name in spider_modules:
    for module in walk_modules(name):
        for spider in iter_spider_classes(module):
            # First Spider row whose name equals the spider class's name.
            sp = db_session.query(Spider).filter(
                Spider.name == spider.name).first()
            if sp:
                # Store the defining module's dotted path on the row.
                sp.module = str(spider.__module__)
                db_session.add(sp)

# Single commit for all updated rows.
db_session.commit()
Ejemplo n.º 41
0
 def __init__(self, domain):
     """Build the template module path for ``domain`` and pass every module
     walked from it to ``self._filter_parsers``."""
     # Base package comes from the TEMPLATE_MODULES setting, with a default.
     self.base = settings.get('TEMPLATE_MODULES', 'yowa.templates')
     self.template_module = '.'.join([self.base, domain])
     self._parsers = {}
     modules = walk_modules(self.template_module)
     for mod in modules:
         self._filter_parsers(mod)
Ejemplo n.º 42
0
def get_spiders_iter():
    """Yield every spider class found in the modules walked from the
    ``SPIDER_MODULES`` setting."""
    for package_name in settings.get('SPIDER_MODULES'):
        for mod in walk_modules(package_name):
            for spider_cls in iter_spider_classes(mod):
                yield spider_cls