def run(self, args, opts):
    """Crawl one domain and export passed items to an output file.

    args[0] is the spider's domain name; args[1] is the output file
    path, whose extension selects the export format. Returns False when
    the argument count is wrong.
    """
    if len(args) != 2:
        return False
    output = args[1]
    file = open(output, 'w+b')
    try:
        # Select the exporter class from the output file extension; an
        # unknown extension raises KeyError (unchanged behavior).
        _, ext = os.path.splitext(output)
        exporter = {
            '.json': JsonLinesItemExporter,
            '.xml': XmlItemExporter,
            '.csv': CsvItemExporter,
            '.pickle': PickleItemExporter,
        }[ext](file)
        dispatcher.connect(exporter.export_item, signal=signals.item_passed)
        exporter.start_exporting()

        SPIDER.domain_name = args[0]
        # 'is None' instead of '== None' — identity test for the sentinel.
        if opts.alias is None:
            SPIDER.aliases = []
        else:
            SPIDER.aliases = opts.alias
        if opts.start_url is None:
            SPIDER.start_urls = ['http://%s' % SPIDER.domain_name]
        else:
            SPIDER.start_urls = opts.start_url

        scrapymanager.runonce(SPIDER)
        exporter.finish_exporting()
    finally:
        # Fix: the output file handle was previously never closed.
        file.close()
def run(self):
    """Bring up the test site (reusing self.portno when set) and run
    the spider against its root URL."""
    if self.portno:
        # A port was pre-assigned: bind the test site to it.
        self.port = start_test_site(self.portno)
    else:
        # No port yet: let the site pick one and remember it.
        self.port = start_test_site()
        self.portno = self.port.getHost().port
    self.spider.start_urls = [self.geturl("/")]
    scrapymanager.configure()
    scrapymanager.runonce(self.spider)
def run(self, args, opts):
    """Run the SPIDER defined in a Python file given on the command
    line, optionally exporting passed items as XML to opts.output.

    args[0] is the path of the module to import. Returns False when no
    module path is given.
    """
    if len(args) != 1:
        return False
    file = None
    exporter = None
    if opts.output:
        file = open(opts.output, 'w+b')
        exporter = XmlItemExporter(file)
        dispatcher.connect(exporter.export_item, signal=signals.item_passed)
        exporter.start_exporting()
    try:
        module = _import_file(args[0])
        scrapymanager.runonce(module.SPIDER)
    finally:
        # Fix: finalize the export and close the file even if the crawl
        # raises; the original leaked the open file handle.
        if exporter is not None:
            exporter.finish_exporting()
        if file is not None:
            file.close()
def fetch(urls):
    """Fetch a list of urls and return a list of the downloaded Scrapy
    Responses.

    This is a blocking function not suitable for calling from spiders.
    Instead, it is intended to be called from outside the framework,
    such as Scrapy commands or standalone scripts.
    """
    collected = []
    pending = [
        Request(url, callback=collected.append, dont_filter=True)
        for url in urls
    ]
    scrapymanager.runonce(*pending)
    return collected
def run(self, args, opts):
    """Crawl a domain and write one report file per category under
    <opts.output>/<domain>/<opts.name>/ in the opts.format format.

    args[0] is the domain name. Returns False when no domain argument
    is given.
    """
    if not args:
        return False
    domain = args[0]

    # Create the nested output directory, logging each level we create.
    path = [opts.output, domain, opts.name]
    parent = ''
    for d in path:
        parent = os.path.join(parent, d)
        if not os.path.exists(parent):
            os.mkdir(parent)
            log.msg('Created directory: %s' % parent, level=log.INFO)
    output = os.path.join(*path)

    reports = ['errors', 'dirtyurls', 'offsite', 'clean']
    # Hoisted out of the loop: resolve the exporter class once. An
    # unknown format raises KeyError before any file is opened.
    exporter_cls = {
        'json': JsonLinesItemExporter,
        'xml': XmlItemExporter,
        'csv': CsvItemExporter,
        'pickle': PickleItemExporter,
    }[opts.format]

    # One exporter (and backing file) per report category.
    files = []
    exporters = {}
    for report in reports:
        file = open(os.path.join(output, '%s.%s' % (report, opts.format)), 'w+b')
        files.append(file)
        exporters[report] = exporter_cls(file)

    try:
        junction = ReportJunction(exporters)
        dispatcher.connect(junction, signal=signals.item_passed)
        junction.start()

        SPIDER.domain_name = domain
        # 'is None' instead of '== None' — identity test for the sentinel.
        SPIDER.aliases = [] if opts.alias is None else opts.alias
        if opts.start_url is None:
            SPIDER.start_urls = ['http://%s' % SPIDER.domain_name]
        else:
            SPIDER.start_urls = opts.start_url

        scrapymanager.runonce(SPIDER)
        junction.finish()
    finally:
        # Fix: the four report files were previously never closed.
        for file in files:
            file.close()
def run(self):
    """Run a TestSpider against a fresh test site, wiring signal
    handlers so the test can inspect what fired."""
    self.port = start_test_site()
    self.portno = self.port.getHost().port
    self.spider = TestSpider()
    if self.spider:
        self.spider.start_urls = [
            self.geturl("/"),
            self.geturl("/redirect"),
        ]
        # Lifecycle signals all share the generic recorder; the data
        # signals get their dedicated handlers.
        lifecycle = (
            signals.engine_started,
            signals.engine_stopped,
            signals.spider_opened,
            signals.spider_idle,
            signals.spider_closed,
        )
        for sig in lifecycle:
            dispatcher.connect(self.record_signal, sig)
        dispatcher.connect(self.item_scraped, signals.item_scraped)
        dispatcher.connect(self.request_received, signals.request_received)
        dispatcher.connect(self.response_downloaded, signals.response_downloaded)
        scrapymanager.configure()
        scrapymanager.runonce(self.spider)
        self.port.stopListening()
        self.wasrun = True
def run(self, args, opts):
    """Run all command-line arguments through the manager in one pass."""
    targets = tuple(args)
    scrapymanager.runonce(*targets)
def crawl(self, *args):
    """Delegate the given crawl targets to a single runonce pass."""
    crawl_targets = args
    scrapymanager.runonce(*crawl_targets)