Example 1
    def run(self, args, opts):
        basedir = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

        def _keyfunc(module):
            return module.__name__.rsplit('.', 1)[-1].split('_', 1)[0]

        with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f:
            f.write(
                dedent("""\
            Spiders
            =======

            ..  Do not edit this file. Instead, run: `scrapy updatedocs`
            """))

            for key, group in groupby(
                    walk_modules('kingfisher_scrapy.spiders'), _keyfunc):
                if key in ('spiders', 'test'):
                    continue

                f.write('\n{}\n{}\n'.format(key.capitalize(), '-' * len(key)))

                for module in group:
                    for cls in iter_spider_classes(module):
                        f.write(
                            '\n.. autoclass:: {}.{}\n   :no-members:\n'.format(
                                module.__name__, cls.__name__))
Example 2
    def __init__(self, settings):
        self.elastic = Elastic.from_settings(settings)

        # Collect spiders from the product_spider/spiders and product_spider/datafeeds
        # folders and map each spider's name to its redis_key in spiders_rediskey
        self.spiders_rediskey = {}
        spider_modules = settings.getlist('SPIDER_MODULES')
        for name in spider_modules:
            try:
                for module in walk_modules(name):
                    for spcls in iter_spider_classes(module):
                        if spcls.name and getattr(spcls, 'crawler_type', None) == 'spider' and \
                            hasattr(spcls, 'redis_key'):
                            self.spiders_rediskey[spcls.name] = spcls.redis_key
                            logger.info('search spider:%(name)s, key=%(key)s',
                                        {
                                            'name': spcls.name,
                                            'key': spcls.redis_key
                                        })
            except ImportError:
                pass
        self.req_session = requests.Session()
        self.group_items_api = settings.get(
            'SEARCH_API') + '/se/spider-update/group/'
        self.group_items_cache_ttl = 86400
Example 3
class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", \
            help="set spider argument (may be repeated)")
        parser.add_option("-o", "--output", metavar="FILE", \
            help="dump scraped items into FILE (use - for stdout)")
        parser.add_option("-t", "--output-format", metavar="FORMAT", default="jsonlines", \
            help="format to use for dumping items with -o (default: %default)")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE",
                             print_help=False)
        if opts.output:
            if opts.output == '-':
                self.settings.overrides['FEED_URI'] = 'stdout:'
            else:
                self.settings.overrides['FEED_URI'] = opts.output
            valid_output_formats = self.settings['FEED_EXPORTERS'].keys(
            ) + self.settings['FEED_EXPORTERS_BASE'].keys()
            if opts.output_format not in valid_output_formats:
                raise UsageError(
                    'Invalid/unrecognized output format: %s, Expected %s' %
                    (opts.output_format, valid_output_formats))
            self.settings.overrides['FEED_FORMAT'] = opts.output_format

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spider = spclasses.pop()(**opts.spargs)

        crawler = self.crawler_process.create_crawler()
        crawler.crawl(spider)
        self.crawler_process.start()
Example 4
class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spider = spclasses.pop()()
        # schedule spider and start engine
        self.crawler.queue.append_spider(spider)
        self.crawler.start()
Example 5
def find_spider_cls(spider_name, spider_packages):
    """查找spider"""
    for package_name in spider_packages:
        for module in walk_modules(package_name):
            for spider_cls in iter_spider_classes(module):
                if spider_cls.name == spider_name:
                    return spider_cls
Example 6
    def crawl_in_loop(self, runner):
        """在循环中爬取"""
        # 遍历取出 spider
        spider_list = []
        for spider_class in iter_spider_classes(regex_proxy_spider):
            ip_count = getattr(spider_class, 'ip_count', 0)
            if ip_count > 0:
                spider_list.append(spider_class)

        all_loop = ProxyCounter()
        single_loop = ProxyCounter()
        # Started once at the beginning; counted after each round ends
        all_loop.start()
        # Loop forever
        loop_times = 0
        while loop_times >= 0:
            loop_times += 1

            # Started at the beginning of each round; counted after each spider finishes
            single_loop.start()
            while single_loop.available.start_num > 100:
                print(f'{single_loop.available.start_num} valid IPs, resting for 10 minutes')
                time.sleep(60 * 10)
                single_loop.start()

            # Count at the start of the round
            log.info(f'Round {loop_times} of crawling started')

            # Crawl
            for i in range(len(spider_list)):
                spider = spider_list[i]
                log.info(
                    f'Round {loop_times}, spider {i + 1}/{len(spider_list)} ({spider.name}) starting, '
                    f'{single_loop.print_count()}')

                spider = spider_list[i]
                try:
                    yield runner.crawl(spider)
                except SystemExit:
                    pass
                sleep_time = 10
                divider = '-' * 10
                single_loop.count()
                log.info(
                    f'{divider}Round {loop_times}, spider {i + 1}/{len(spider_list)} ({spider.name}) finished, '
                    f'{single_loop.print_count()} {divider}')
                log.info(f'Waiting before the next spider, sleep {sleep_time}')
                time.sleep(sleep_time)

            # Delay before the next round
            sleep_time = 60
            log.info(f'This round finished, waiting for the next round, sleep {sleep_time}')
            all_loop.count()
            log.info(all_loop.print_count())
            time.sleep(sleep_time)
        # noinspection PyUnresolvedReferences
        reactor.stop()
Example 7
def get_spider_class(spider_name, project_settings):
    spider_modules = project_settings.get('SPIDER_MODULES')
    for spider_module in spider_modules:
        modules = walk_modules(spider_module)
        for module in islice(modules, 1, None):
            for spider_class in iter_spider_classes(module):
                if spider_class.name == spider_name:
                    return spider_class
    return None
Example 8
 def _load_spiders(self, module):
     """
     Load the spiders defined in a module.
     iter_spider_classes yields obj only if:
     inspect.isclass(obj) -- obj is a class
     issubclass(obj, Spider) -- obj is a subclass of Spider
     obj.__module__ == module.__name__ -- obj is defined in this module
     getattr(obj, 'name', None) -- obj's name attribute is non-empty
     """
     for spcls in iter_spider_classes(module):
         self._spiders[spcls.name] = spcls
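
The docstring above lists exactly the filters that iter_spider_classes applies. As a minimal, hypothetical sketch (module and spider names are made up), only the class with a non-empty name would be yielded:

# myproject/spiders/toy.py  (hypothetical module)
import scrapy

class BaseToySpider(scrapy.Spider):
    name = None  # skipped: no non-empty name attribute

class ToySpider(BaseToySpider):
    name = 'toy'  # yielded: a Spider subclass defined in this module with a non-empty name

# from scrapy.utils.spider import iter_spider_classes
# from myproject.spiders import toy
# list(iter_spider_classes(toy)) would then give [ToySpider]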
Example 9
def assert_good_spider_type(settings, spider_name):
	spider_modules = settings['SPIDER_MODULES']
	if isinstance(spider_modules, basestring):
		spider_modules = [spider_modules]

	for spider_module in spider_modules:
		for module in walk_modules(spider_module):
			for spcls in iter_spider_classes(module):
				if spcls.name == spider_name:
					if not issubclass(spcls, ForumSpider):
						raise Exception('Spider %s is not a Forum Spider. Please use the right script for your spider.' % spider_name)
Example 10
def find_spider_cls(spider_name, spider_packages):
    """
    Find spider class which name is equal to `spider_name` argument

    :param spider_name: spider name to look for
    :param spider_packages: a list of package names that will be searched for
        spider classes
    """
    for package_name in spider_packages:
        for module in walk_modules(package_name):
            for spider_cls in iter_spider_classes(module):
                if spider_cls.name == spider_name:
                    return spider_cls
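
For context, a hedged sketch of how the helper above would typically be called, assuming a Scrapy project whose settings define SPIDER_MODULES (the spider name 'example' is made up):

from scrapy.utils.project import get_project_settings

settings = get_project_settings()
spider_cls = find_spider_cls('example', settings.getlist('SPIDER_MODULES'))
if spider_cls is None:
    raise KeyError("No spider named 'example' was found")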
Example 11
def find_spider_cls(spider_name, spider_packages):
    """
    Find spider class which name is equal to `spider_name` argument

    :param spider_name: spider name to look for
    :param spider_packages: a list of package names that will be searched for
        spider classes
    """
    for package_name in spider_packages:
        for module in walk_modules(package_name):
            for spider_cls in iter_spider_classes(module):
                if spider_cls.name == spider_name:
                    return spider_cls
Example 12
 def run(self, args, opts):
     #settings = get_project_settings()
     #spider_loader = self.crawler_process.spider_loader
     crawler = self.crawler_process.create_crawler()
     names = crawler.spiders.list()
     spclasses = []
     for spidername in names:
         crawler = self.crawler_process.create_crawler()
         module = _import_file("./eastmoney/spiders/"+spidername+"_spider.py")
         #spclasses = spclasses+list(iter_spider_classes(module))
         spider = list(iter_spider_classes(module)).pop()(**opts.spargs)
         crawler.crawl(spider)
         self.crawler_process.start()
         crawler._spider = None
Example 13
    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        spidername = args[0]
        spider_module = __import__file(spidername)
        sp_classes = list(iter_spider_classes(spider_module))
        if not sp_classes:
            raise UsageError("No spider found : %s\n" % spidername)
        spidercls = sp_classes.pop()

        self.crawler_process.crawl(spidercls, **opts.__dict__)
        self.crawler_process.start()
        if self.crawler_process.bootstrap_failed:
            self.exitcode = 1
Example 14
    def run(self, args, opts):
        basedir = os.path.dirname(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

        def _keyfunc(module):
            module_name = module.__name__.rsplit('.', 1)[-1]
            if module_name.startswith(('costa_rica', 'dominican_republic')):
                return '_'.join(module_name.split('_', 2)[:2])
            return module_name.split('_', 1)[0]

        with open(os.path.join(basedir, 'docs', 'spiders.rst')) as f:
            lines = []
            for line in f:
                lines.append(line)
                if line.startswith('.. Do not edit past this line.'):
                    break

        with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f:
            for line in lines:
                f.write(line)

            for key, group in groupby(
                    walk_modules('kingfisher_scrapy.spiders'), _keyfunc):
                if key in ('spiders', 'fail'):
                    continue

                f.write(
                    f"\n{key.replace('_', ' ').title()}\n{'-' * len(key)}\n")

                for module in group:
                    for cls in iter_spider_classes(module):
                        f.write(
                            f'\n.. autoclass:: {module.__name__}.{cls.__name__}\n   :no-members:\n'
                        )

                        infix = ''
                        if cls.__doc__:
                            section = re.search(
                                r'^Environment variables\n(.+?)(?:^\S|\Z)',
                                dedent(cls.__doc__), re.MULTILINE | re.DOTALL)
                            if section:
                                environment_variables = re.findall(
                                    r'^(\S.+)\n  ', dedent(section[1]),
                                    re.MULTILINE)
                                infix = f"env {' '.join([f'{variable}=...' for variable in environment_variables])} "

                        f.write('\n.. code-block:: bash\n')
                        f.write(
                            f"\n   {infix}scrapy crawl {module.__name__.rsplit('.')[-1]}\n"
                        )
Example 15
def enumerate_spider_classes():
    original_cd = os.getcwd()
    imported_settings = sys.modules.pop("settings", None)
    for spider_project in SpiderProject.objects.all():
        os.chdir(spider_project.path)
        os.environ.pop(ENVVAR, None)  # force get_project_settings() to reconsider the current directory
        project_settings = get_project_settings()
        for module_or_package_name in project_settings.get("SPIDER_MODULES"):
            for module in walk_modules(module_or_package_name):
                for spider_cls in iter_spider_classes(module):
                    yield (spider_project, spider_cls)
    if imported_settings is not None:
        sys.modules["settings"] = imported_settings
    os.chdir(original_cd)
Example 16
 def run(self, args, opts):
     #settings = get_project_settings()
     #spider_loader = self.crawler_process.spider_loader
     crawler = self.crawler_process.create_crawler()
     names = crawler.spiders.list()
     spclasses = []
     for spidername in names:
         crawler = self.crawler_process.create_crawler()
         module = _import_file("./eastmoney/spiders/" + spidername +
                               "_spider.py")
         #spclasses = spclasses+list(iter_spider_classes(module))
         spider = list(iter_spider_classes(module)).pop()(**opts.spargs)
         crawler.crawl(spider)
         self.crawler_process.start()
         crawler._spider = None
Example 17
def run(args):
    print(args)
    if len(args) != 1:
        raise UsageError()
    scrapy.utils.log.configure_logging()
    spidername = args[0]
    spider_module = __import__file(spidername)
    sp_classes = list(iter_spider_classes(spider_module))
    if not sp_classes:
        raise UsageError("No spider found : %s\n" % spidername)
    spidercls = sp_classes.pop()

    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)
    process.crawl(spidercls)
    process.start()
Example 18
    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spidercls = spclasses.pop()

        self.crawler_process.crawl(spidercls, **opts.spargs)
        self.crawler_process.start()
Example 19
    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spidercls = spclasses.pop()

        self.crawler_process.crawl(spidercls, **opts.spargs)
        self.crawler_process.start()
Example 20
    def _load_spider(self, module, spider):
        for spcls in iter_spider_classes(module):
            spider_name = spcls.name
            if spider_name != spider:
                continue

            spmdl = self._get_spider_db_model(spcls.name, ['crawl_method2'])
            if spmdl and spmdl.crawl_method2 and spmdl.crawl_method2.crawl_method:
                new_spcls = self._configure_spider_class(spcls, spmdl)
                if new_spcls is not None and inspect.isclass(new_spcls) and (
                        issubclass(new_spcls, BaseSpider)
                        or issubclass(new_spcls, Spider)):
                    spcls = new_spcls
                    setattr(spcls, 'name', spider_name)

            self._spiders[spider_name] = spcls
            break
Example 21
class Command(ScrapyCommand):

    requires_project = False

    def syntax(self):
        return "[options] <spider_file>"

    def short_desc(self):
        return "Run a self-contained spider (without creating a project)"

    def long_desc(self):
        return "Run the spider defined in the given file"

    def add_options(self, parser):
        ScrapyCommand.add_options(self, parser)
        parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", \
            help="set spider argument (may be repeated)")

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE",
                             print_help=False)

    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError("File not found: %s\n" % filename)
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError("Unable to load %r: %s\n" % (filename, e))
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError("No spider found in file: %s\n" % filename)
        spider = spclasses.pop()(**opts.spargs)

        self.crawler.crawl(spider)
        self.crawler.start()
Example 22
    def run(self, args, opts):
        if len(args) != 1:
            raise UsageError()
        filename = args[0]
        if not os.path.exists(filename):
            raise UsageError(f"File not found: {filename}\n")
        try:
            module = _import_file(filename)
        except (ImportError, ValueError) as e:
            raise UsageError(f"Unable to load {filename!r}: {e}\n")
        spclasses = list(iter_spider_classes(module))
        if not spclasses:
            raise UsageError(f"No spider found in file: {filename}\n")
        spidercls = spclasses.pop()

        self.crawler_process.crawl(spidercls, **opts.spargs)
        self.crawler_process.start()

        if self.crawler_process.bootstrap_failed:
            self.exitcode = 1
Example 23
def find_spiders(type=None):
    """Find all classes that subclass scrapy.Spider

    If type is given then the output is filtered by type.
    Possible type values ['vk', 'site'].
    """

    spider_map = {}

    def _get_spiders(spiders, spider_map):
        """Returns a list of all spiders with unique name found in a module

        If 2 spiders with the same name are found, that subclass one another,
        then the child one is taken (based on mro)
        """
        # if two spiders with the same name are found, then take the one that
        # subclasses the autogenerated
        for s in spiders:
            if s.name in spider_map:
                # leave only the one that subclasses parent
                old = spider_map[s.name]
                if old in s.mro():
                    spider_map[s.name] = s
            else:
                spider_map[s.name] = s
        # the same one as passed with new values
        return spider_map

    for module in misc.walk_modules(settings.NEWSPIDER_MODULE):
        # crawl responsibly
        spiders = [s for s in spider.iter_spider_classes(module)
                   if not type or s.type == type]
        _get_spiders(spiders, spider_map)
    # add user generated modules
    user_spiders = autogenerate.load_spiders_from_json(
        settings.USER_SPIDERS_FILE)
    _get_spiders(user_spiders, spider_map)
    # check for name uniqueness
    return spider_map.values()
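
A short usage note for the function above: with the type filter described in the docstring, callers could write something like the following ('vk' is one of the documented possible values):

vk_spiders = find_spiders(type='vk')  # only spiders whose type attribute is 'vk'
all_spiders = find_spiders()          # no filter: every uniquely named spider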
Example 24
 def test_iter_spider_classes(self):
     import scrapy.tests.test_utils_spider
     it = iter_spider_classes(scrapy.tests.test_utils_spider)
     self.assertEqual(set(it), {MySpider1, MySpider2})
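
The test above assumes a companion test_utils_spider module that defines exactly two named spiders. A minimal sketch of what such a module could contain (the class names come from the assertion; the name values and base class are assumptions):

from scrapy.spiders import Spider

class MySpider1(Spider):
    name = 'myspider1'

class MySpider2(Spider):
    name = 'myspider2'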
Example 25
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(module):
         self.spiders.append(spcls.name)
Example 26
    def crawl_in_loop(self, runner):
        """在循环中爬取"""
        # 遍历取出 spider
        spider_list = []
        for spider_class in iter_spider_classes(regex_proxy_spider):
            ip_count = getattr(spider_class, 'ip_count', 0)
            if ip_count > 0:
                spider_list.append(spider_class)
        loop_times = 0
        loop_end_count = 0
        all_loop_proxy_count = 0
        """整个循环中爬取的代理总数"""
        # 无限循环
        while loop_times >= 0:
            loop_times += 1

            # Count at the start of the round
            if loop_end_count == 0:
                # First round: fetch the current count
                loop_start_count = proxy_manager.count()
            else:
                # Otherwise reuse the count from the end of the previous round
                loop_start_count = loop_end_count
            log.info(f'Round {loop_times} of crawling started, {loop_start_count} IPs in total')

            # Crawl

            spider_end_count = 0
            for i in range(len(spider_list)):
                spider = spider_list[i]
                if spider_end_count == 0:
                    spider_start_count = loop_start_count
                else:
                    spider_start_count = spider_end_count
                log.info(
                    f'Round {loop_times}, spider {i+1}/{len(spider_list)} ({spider.name}) starting, '
                    f'{spider_start_count} IPs in total')

                spider = spider_list[i]
                try:
                    yield runner.crawl(spider)
                except SystemExit:
                    pass
                sleep_time = 10
                spider_end_count = proxy_manager.count()
                spider_crawled_count = spider_end_count - spider_start_count
                loop_crawled_count = spider_end_count - loop_start_count
                # Number of proxies crawled in this single round
                all_loop_proxy_count += loop_crawled_count
                divider = '-' * 10
                log.info(
                    f'{divider}Round {loop_times}, spider {i+1}/{len(spider_list)} ({spider.name}) finished, '
                    f'crawled {spider_crawled_count}/{loop_crawled_count}/{all_loop_proxy_count} proxies{divider}'
                )
                log.info(f'Waiting before the next spider, sleep {sleep_time}')
                log.info(f'Currently {proxy_manager.available_count()} valid proxies')
                time.sleep(sleep_time)

            # Count at the end of the round
            loop_end_count = proxy_manager.count()
            # Delay before the next round
            sleep_time = 60
            log.info(
                f'This round crawled {loop_end_count-loop_start_count}/{loop_end_count} proxies, waiting for the next round, sleep {sleep_time}'
            )
            log.info(f'Currently {proxy_manager.available_count()} valid proxies')
            time.sleep(sleep_time)
        reactor.stop()
Example 27
 def run(self, args, opts):
     for module in walk_modules('kingfisher_scrapy.spiders'):
         for cls in iter_spider_classes(module):
             Checker(module, cls).check()
Example 28
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(module):
         self._found[spcls.name].append((module.__name__, spcls.__name__))
         self._spiders[spcls.name] = spcls
Example 29
 def _load_spider(self, spider_path, spider_name=''):
     for spider_cls in iter_spider_classes(spider_path):
         if spider_name and spider_cls.name == spider_name:
             return spider_cls
         else:
             return spider_cls  # return the first spider module
Example 30
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(module):
         self._spiders[spcls.name] = spcls
Example 31
def run_spider(spider_id):
    spider = SpiderModel.objects.get(id=spider_id)
    execution = Execution.objects.create(spider_id=spider_id,
                                         time_started=now())

    user_settings = getattr(settings, 'SCRATCHY_SPIDERS', {})

    item_storage = tempfile.NamedTemporaryFile(delete=False)

    default_settings = {
        'DNS_TIMEOUT': 5,
        'DOWNLOAD_TIMEOUT': 5,
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_MAX_DELAY': 5,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': 1.0,
    }

    internal_settings = {
        'FEED_FORMAT': 'jsonlines',
        'FEED_EXPORT_ENCODING': 'utf-8',
        'FEED_URI': item_storage.name,
    }

    scrapy_settings = {
        **default_settings,
        **user_settings,
        **spider.settings,
        **internal_settings,  # last because these must not be overwritten
    }

    scrapy_spider_cls = None

    module = importlib.import_module(spider.module)

    for cls in iter_spider_classes(module):
        scrapy_spider_cls = cls
        break  # use first valid class in module

    if scrapy_spider_cls is None:
        raise RuntimeError(f'No valid spider class found in module {module}')

    process = CrawlerProcess(settings=None, install_root_handler=False)

    log_capture_string = StringIO()
    log_handler = logging.StreamHandler(log_capture_string)
    log_handler.setLevel(spider.log_level)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    log_handler.setFormatter(formatter)
    scrapy_logger = logging.getLogger('scrapy')
    scrapy_logger.addHandler(log_handler)
    spider_logger = logging.getLogger(
        scrapy_spider_cls.name)  # scrapy uses the spider name as logger id
    spider_logger.addHandler(log_handler)

    crawler = Crawler(scrapy_spider_cls, scrapy_settings)

    process.crawl(crawler)
    process.start()
    # blocks here

    log_contents = log_capture_string.getvalue()
    spider_logger.removeHandler(log_handler)
    scrapy_logger.removeHandler(log_handler)
    log_capture_string.close()

    execution.time_ended = now()
    execution.stats = crawler.stats._stats
    execution.log = log_contents
    execution.save()

    item_storage.seek(0)

    items = []
    for line in item_storage.readlines():
        item_data = json.loads(line)
        item = Item(spider_id=spider_id, execution=execution, data=item_data)
        items.append(item)

    Item.objects.bulk_create(items, batch_size=100)

    item_storage.close()
    os.remove(item_storage.name)
Example 32
 def _load_spiders(self, module):
     # use the built-in function for loading spiders
     for spcls in iter_spider_classes(module):
         self._spiders[spcls.name] = spcls
Example 33
 def test_iter_spider_classes(self):
     import tests.test_utils_spider
     it = iter_spider_classes(tests.test_utils_spider)
     self.assertEqual(set(it), {MySpider1, MySpider2})
Example 34
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(module):
         self._found[spcls.name].append(
             (module.__name__, spcls.__name__))  # a module's __name__ is the module name
         self._spiders[spcls.name] = spcls  # the spider class's name attribute, not an instance's
Example 35
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(module):
         self._spiders[spcls.name] = spcls
Example 36
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(
              module):  # checks every object in the given module and yields the Spider subclasses
         self._found[spcls.name].append((module.__name__, spcls.__name__))
         self._spiders[spcls.name] = spcls
Example 37
def get_spiders_iter():
    for name in settings.get('SPIDER_MODULES'):
        for module in walk_modules(name):
            for spcls in iter_spider_classes(module):
                yield spcls
Example 38
def iter_spider_from_module(modules):
    """返回包含指定模块下的所有 spider 类的生成器"""
    for m in walk_modules(modules):
        yield from iter_spider_classes(m)
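
A brief usage sketch for the generator above (the 'myproject.spiders' package name is illustrative):

for spider_cls in iter_spider_from_module('myproject.spiders'):
    print(spider_cls.name)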
Example 39
from sqlalchemy.sql import text

HERE = os.path.dirname(os.path.abspath(__file__))
product_spiders_root = os.path.dirname(HERE)
project_root = os.path.dirname(product_spiders_root)

sys.path.append(project_root)
sys.path.append(os.path.join(project_root, 'product_spiders'))

from product_spiders.db import Session
from scrapy.utils.misc import walk_modules
from scrapy.utils.spider import iter_spider_classes
from productspidersweb.models import Spider

print(sys.path)
here = os.path.abspath(os.path.dirname(__file__))

db_session = Session()

spider_modules = ['product_spiders.spiders']

for name in spider_modules:
    for module in walk_modules(name):
        for spider in iter_spider_classes(module):
            sp = db_session.query(Spider).filter(
                Spider.name == spider.name).first()
            if sp:
                sp.module = str(spider.__module__)
                db_session.add(sp)

db_session.commit()
Example 40
 def all_true_case(self, module1):
     self._result = list(iter_spider_classes(module1))
Example 41
 def test_iter_spider_classes(self):
     import tests.test_utils_spider
     it = iter_spider_classes(tests.test_utils_spider)
     self.assertEqual(set(it), set([MySpider1, MySpider2]))
Example 42
 def _load_spiders(self, module):
     for spcls in iter_spider_classes(module):
         self._found[spcls.name].append((module.__name__, spcls.__name__))
         self._spiders[spcls.name] = spcls
Example 43
 def d_false_others_true_case(self, module4):
     self._result = list(iter_spider_classes(module4))