def run(self, args, opts):
    """Open the interactive shell, fetching the first positional argument
    as a URL when one is given."""
    target_url = None
    if args:
        target_url = args[0]
    shell = Shell(self.crawler, update_vars=self.update_vars, code=opts.code)
    # The crawler runs in a background thread so the shell owns this one.
    self._start_crawler_thread()
    shell.start(url=target_url)
def run(self, args, opts):
    """Resolve a spider class for the (optional) URL argument, wire up a
    crawler with a persistent engine, and drop into the interactive shell."""
    target_url = args[0] if args else None
    spiders = self.crawler_process.spiders

    if opts.spider:
        spidercls = spiders.load(opts.spider)
    elif target_url:
        spidercls = spidercls_for_request(
            spiders, Request(target_url), DefaultSpider, log_multiple=True)
    else:
        spidercls = DefaultSpider

    # The crawler is created this way since the Shell manually handles the
    # crawling engine, so the set up in the crawl method won't work
    crawler = self.crawler_process._create_logged_crawler(spidercls)
    # The Shell class needs a persistent engine in the crawler
    crawler.engine = crawler._create_engine()
    crawler.engine.start()

    self.crawler_process.start(start_reactor=False)
    self._start_crawler_thread()

    shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
    shell.start(url=target_url)
def run(self, args, opts):
    """Start the interactive shell, choosing the most appropriate spider
    class for the request and honouring the --no-redirect option."""
    target_url = args[0] if args else None
    if target_url:
        # first argument may be a local file
        target_url = guess_scheme(target_url)

    loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = loader.load(opts.spider)
    elif target_url:
        spidercls = spidercls_for_request(
            loader, Request(target_url), DefaultSpider, log_multiple=True)
    else:
        spidercls = DefaultSpider

    # The crawler is created this way since the Shell manually handles the
    # crawling engine, so the set up in the crawl method won't work
    crawler = self.crawler_process._create_crawler(spidercls)
    # The Shell class needs a persistent engine in the crawler
    crawler.engine = crawler._create_engine()
    crawler.engine.start()

    self._start_crawler_thread()
    shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
    shell.start(url=target_url, redirect=not opts.no_redirect)
def test_inspect_response_text(self):
    """A plain-text (JSON) response must not expose a 'sel' selector."""
    body = '''
    {"hello": "world"}
    '''
    response = TextResponse(url='http://example.com/', body=body)
    shell = Shell(self.crawler, code='None')
    shell.start(response=response, spider=self.spider)
    self.assertNotIn('sel', shell.vars)
def run(self, args, opts):
    """Launch the interactive shell, instantiating the spider named on the
    command line when one was given."""
    target_url = args[0] if args else None
    spider = self.crawler.spiders.create(opts.spider) if opts.spider else None
    shell = Shell(self.crawler, update_vars=self.update_vars, code=opts.code)
    self._start_crawler_thread()
    shell.start(url=target_url, spider=spider)
def test_inspect_response_xml(self):
    """An XML response must expose a 'sel' selector in the shell namespace."""
    body = '''
    <?xml version="1.0" encoding="UTF-8"?>
    <foo>Testing</foo>
    '''
    response = XmlResponse(url='http://example.com/', body=body)
    shell = Shell(self.crawler, code='None')
    shell.start(response=response, spider=self.spider)
    self.assertIn('sel', shell.vars)
def test_inspect_response_binary(self):
    """A raw binary response must not expose a 'sel' selector."""
    body = '''
    '{\xcc\xe8\x92\xe6\xb8\xa21\xb2\xe5O6\xc9\x84\xba8
    \xa3\x877\xa8v\xee9p.UJ\xa1m\x8a"H\xb3\xcc\x08\xff
    \x87d\x00i\xce\xb7a\xff\x8c\xd8NX\xae\xc2'
    '''
    response = Response(url='http://example.com/', body=body)
    shell = Shell(self.crawler, code='None')
    shell.start(response=response, spider=self.spider)
    self.assertNotIn('sel', shell.vars)
def run(self, args, opts):
    """Create a crawler, kick off crawling in a background thread, and open
    the interactive shell on the optional URL argument."""
    crawler = self.crawler_process.create_crawler()
    target_url = args[0] if args else None
    if opts.spider:
        spider = crawler.spiders.create(opts.spider)
    else:
        spider = None
    self.crawler_process.start_crawling()
    self._start_crawler_thread()
    shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
    shell.start(url=target_url, spider=spider)
def test_inspect_response_html(self):
    """An HTML response must expose a 'sel' selector in the shell namespace."""
    body = '''
    <!doctype html>
    <html>
    <p>Testing</p>
    </html>
    '''
    response = HtmlResponse(url='http://example.com/', body=body)
    shell = Shell(self.crawler, code='None')
    shell.start(response=response, spider=self.spider)
    self.assertIn('sel', shell.vars)
def shell(argv):
    """ Open a url in the scrapy shell """
    parser = argparse.ArgumentParser('ozzy shell', description=shell.__doc__)
    parser.add_argument('url', help="URL to open in a shell")
    parsed = parser.parse_args(argv)

    crawler_process = CrawlerProcess(load_settings())
    crawler = crawler_process.create_crawler()
    crawler_process.start_crawling()

    # Run the reactor in a daemon thread so the interactive shell keeps
    # control of the main thread (and the process can exit with it).
    reactor_thread = Thread(target=crawler_process.start_reactor)
    reactor_thread.daemon = True
    reactor_thread.start()

    Shell(crawler).start(url=parsed.url)
def run(self, args, opts):
    """Run the shell in its own thread, stop the crawler once the shell's
    deferred fires, and record a non-zero exit code on failure."""
    target_url = args[0] if args else None
    shell = Shell(self.crawler, update_vars=self.update_vars,
                  inthread=True, code=opts.code)

    def on_error(failure):
        # Surface shell failures in the command's exit status.
        log.err(failure, "Shell error")
        self.exitcode = 1

    deferred = shell.start(url=target_url)
    deferred.addErrback(on_error)
    deferred.addBoth(lambda _: self.crawler.stop())
    self.crawler.start()
def run(self, args, opts):
    """Open the interactive shell on the first positional argument, if any."""
    target = args[0] if args else None
    Shell(self.update_vars).start(target)