def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storage with timestamp-based autoremoval.
    if self.db is not None:
        self.db.compact_range()
    del self.db
    garbage_collect()
def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storages with timestamp-based
    # autoremoval.
    self.db.CompactRange()
    del self.db
    garbage_collect()
def scrapy():
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
        # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
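The comment above is the reason such a helper exists at all: CPython releases most objects promptly through reference counting, while PyPy relies on a tracing collector and does not run gc.collect() on interpreter exit, so errors queued in DebugInfo.__del__ would never be printed. A garbage_collect() helper therefore forces a collection explicitly, and typically runs it more than once on PyPy so that weak references and finalizers are actually processed. The sketch below is an assumption about what such a helper looks like, not Scrapy's exact implementation; only gc.collect() and platform.python_implementation() are standard-library facts.

import gc
import platform

# Minimal sketch of a garbage_collect() helper (assumed, not Scrapy's exact code).
IS_PYPY = platform.python_implementation() == 'PyPy'

def garbage_collect():
    # Force a collection; on PyPy a single pass is often not enough to
    # clear dead weak references, so collect a second time.
    gc.collect()
    if IS_PYPY:
        gc.collect()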
def close_spider(self, spider):
    # Do compaction each time to save space and also recreate files to
    # avoid them being removed in storages with timestamp-based autoremoval.
    if self.dbdriver == 'plyvel':
        self.db.compact_range()
    elif self.dbdriver == 'leveldb':
        self.db.CompactRange()
    del self.db
    garbage_collect()
    super(LeveldbCacheStorage, self).close_spider(spider)
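The dbdriver branch only makes sense if the storage opened the database with whichever LevelDB binding is installed, and the del self.db plus garbage_collect() pattern presumably exists so the database handle, and with it the LevelDB lock file, is actually released before the spider finishes. A hypothetical open_spider() counterpart is sketched below; the dbdriver attribute, self.cachedir, and the path layout are assumptions for illustration, while plyvel.DB(..., create_if_missing=True) and leveldb.LevelDB(...) are the real constructors of the two bindings.

import os

def open_spider(self, spider):
    # Hypothetical counterpart to close_spider() above: open the per-spider
    # LevelDB database with whichever binding is available.
    dbpath = os.path.join(self.cachedir, f'{spider.name}.leveldb')
    try:
        import plyvel
        self.dbdriver = 'plyvel'
        self.db = plyvel.DB(dbpath, create_if_missing=True)
    except ImportError:
        import leveldb
        self.dbdriver = 'leveldb'
        self.db = leveldb.LevelDB(dbpath)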
def execute_crawler(identifier):
    # Run the crawler identified by `identifier` from the command line.
    configure_logging()
    settings = project.get_project_settings()
    try:
        crawl(identifier, settings)
        reactor.run()
    finally:
        garbage_collect()
def test_cache_with_limit(self):
    cache = LocalWeakReferencedCache(limit=2)
    r1 = Request('https://example.org')
    r2 = Request('https://example.com')
    r3 = Request('https://example.net')
    cache[r1] = 1
    cache[r2] = 2
    cache[r3] = 3
    self.assertEqual(len(cache), 2)
    self.assertNotIn(r1, cache)
    self.assertIn(r2, cache)
    self.assertIn(r3, cache)
    self.assertEqual(cache[r2], 2)
    self.assertEqual(cache[r3], 3)
    del r2
    # PyPy takes longer to collect dead references
    garbage_collect()
    self.assertEqual(len(cache), 1)
def test_cache_without_limit(self):
    max = 10**4
    cache = LocalWeakReferencedCache()
    refs = []
    for x in range(max):
        refs.append(Request(f'https://example.org/{x}'))
        cache[refs[-1]] = x
    self.assertEqual(len(cache), max)
    for i, r in enumerate(refs):
        self.assertIn(r, cache)
        self.assertEqual(cache[r], i)
    del r  # delete reference to the last object in the list
    # delete half of the objects, make sure that is reflected in the cache
    for _ in range(max // 2):
        refs.pop()
    # PyPy takes longer to collect dead references
    garbage_collect()
    self.assertEqual(len(cache), max // 2)
    for i, r in enumerate(refs):
        self.assertIn(r, cache)
        self.assertEqual(cache[r], i)
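Both tests rely on the same property: LocalWeakReferencedCache (scrapy.utils.datatypes) holds its keys weakly, so an entry disappears once the last strong reference to the key dies. Under CPython's reference counting that happens immediately; on PyPy it only happens after the collector runs, which is why the tests call garbage_collect() before asserting on len(cache). A standalone sketch of the same effect using only the standard library's weakref.WeakKeyDictionary:

import gc
import weakref

class Key:
    """Weakly referenceable key (instances of user-defined classes support weakrefs)."""

cache = weakref.WeakKeyDictionary()
k1, k2 = Key(), Key()
cache[k1] = 'a'
cache[k2] = 'b'
assert len(cache) == 2

del k1          # drop the only strong reference to the first key
gc.collect()    # on PyPy the entry only vanishes after a collection
assert len(cache) == 1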
def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)


# Run cmd.run(args, opts) under CPython's profiler (cProfile), passing the
# current variables into the profiled call.
def _run_command_profiled(cmd, args, opts):
    if opts.profile:
        sys.stderr.write(
            f"scrapy: writing cProfile stats to {opts.profile!r}\n")
    loc = locals()  # grab all local variables
    p = cProfile.Profile()
    # Call the function, passing in the variables from globals() and locals().
    p.runctx('cmd.run(args, opts)', globals(), loc)
    if opts.profile:
        p.dump_stats(opts.profile)


if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect() on exit:
        # http://doc.pypy.org/en/latest/cpython_differences.html
        # ?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
def main(): """Command line entry point.""" settings = get_project_settings() configure_logging(settings) args = _parse_args() LOGGER.info(args) base_dir = Path(settings["BASE_DIR"]).resolve() cache_dir = base_dir / ".scrapy" / "httpcache" feeds_dir = Path(args.feeds_dir) if args.feeds_dir else base_dir / "feeds" feeds_dir = feeds_dir.resolve() feeds_dir_scraper = ( feeds_dir / args.feeds_subdir if args.feeds_subdir else feeds_dir / args.spider ) file_tag = normalize_space(args.file_tag) out_file = feeds_dir_scraper / "%(class)s" / f"%(time)s{file_tag}.jl" LOGGER.info("Output file will be <%s>", out_file) from_settings = job_dir_from_settings(settings) job_dir = ( Path(args.job_dir) if args.job_dir else Path(from_settings) if from_settings else base_dir / "jobs" / args.spider ) job_dir = job_dir.resolve() cache_dir.mkdir(parents=True, exist_ok=True) feeds_dir_scraper.mkdir(parents=True, exist_ok=True) job_dir.mkdir(parents=True, exist_ok=True) dont_run_before_file = job_dir / ".dont_run_before" dont_run_before = parse_date( args.dont_run_before, tzinfo=timezone.utc ) or date_from_file(dont_run_before_file, tzinfo=timezone.utc) if dont_run_before: LOGGER.info("Don't run before %s", dont_run_before.isoformat()) sleep_seconds = dont_run_before.timestamp() - now().timestamp() if sleep_seconds > 0: LOGGER.info("Going to sleep for %.1f seconds", sleep_seconds) sleep(sleep_seconds) states = _find_states( job_dir, state_file=settings.get("STATE_TAG_FILE") or ".state" ) running = sorted(sub_dir for sub_dir, state in states.items() if state == "running") if len(running) > 1: LOGGER.warning( "Found %d running jobs %s, please check and fix!", len(running), running ) return if running: LOGGER.info("Found a running job <%s>, skipping...", running[0]) return resumable = sorted( sub_dir for sub_dir, state in states.items() if state in RESUMABLE_STATES ) if len(resumable) > 1: LOGGER.warning( "Found %d resumable jobs %s, please check and fix!", len(resumable), resumable, ) return if resumable: LOGGER.info("Resuming previous job <%s>", resumable[0]) job_tag = resumable[0] if resumable else now().strftime(DATE_FORMAT) curr_job = job_dir / job_tag command = [ "scrapy", "crawl", args.spider, "--output", str(out_file), "--set", f"JOBDIR={curr_job}", "--set", f"DONT_RUN_BEFORE_FILE={dont_run_before_file}", ] try: execute(argv=command) finally: garbage_collect()
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)


def _run_command(cmd, args, opts):
    if opts.profile:
        _run_command_profiled(cmd, args, opts)
    else:
        cmd.run(args, opts)


def _run_command_profiled(cmd, args, opts):
    if opts.profile:
        sys.stderr.write("scrapy: writing cProfile stats to %r\n" % opts.profile)
    loc = locals()
    p = cProfile.Profile()
    p.runctx('cmd.run(args, opts)', globals(), loc)
    if opts.profile:
        p.dump_stats(opts.profile)


if __name__ == '__main__':
    try:
        execute()
    finally:
        # Twisted prints errors in DebugInfo.__del__, but PyPy does not run gc.collect()
        # on exit: http://doc.pypy.org/en/latest/cpython_differences.html?highlight=gc.collect#differences-related-to-garbage-collection-strategies
        garbage_collect()
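When a command is run with --profile, _run_command_profiled() dumps raw cProfile stats to the given file rather than printing anything readable. The dump is inspected afterwards with the standard library's pstats module; the file name below is only an example, assuming a prior run along the lines of `scrapy crawl myspider --profile crawl.cprof`.

import pstats

stats = pstats.Stats('crawl.cprof')
stats.sort_stats('cumulative').print_stats(20)  # top 20 functions by cumulative time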