def get_defaults_spider_mw(self):
    crawler = get_crawler()
    spider = Spider('foo')
    spider.set_crawler(crawler)
    defaults = dict([(k, [v]) for k, v in
                     six.iteritems(crawler.settings.get('DEFAULT_REQUEST_HEADERS'))])
    return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
def get_defaults_spider_mw(self):
    crawler = get_crawler()
    spider = Spider('foo')
    spider.set_crawler(crawler)
    defaults = dict([(k, [v]) for k, v in
                     crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
    return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
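# Sketch of how a helper like the two variants above is typically used (not
# from the source): the middleware should fill the request with the configured
# DEFAULT_REQUEST_HEADERS. The test name and URL are illustrative assumptions.
def test_process_request(self):
    defaults, spider, mw = self.get_defaults_spider_mw()
    req = Request('http://www.scrapytest.org')
    mw.process_request(req, spider)
    self.assertEqual(req.headers, defaults)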
class ChunkExtensionTest(object):
    settings = {}

    def tearDown(self):
        self.remove_temp_dir()

    def start(self, n_items_per_chunk=None, n_items=None, settings=None):
        # Reset item generator and remove temporary dir
        ItemGenerator.reset()
        self.remove_temp_dir()
        # Setup settings
        settings = settings or self.settings.copy()
        if n_items_per_chunk is not None:
            settings['CHUNKED_FEED_ITEMS_PER_CHUNK'] = n_items_per_chunk
        # Init Scrapy
        self.crawler = get_crawler(settings)
        self.spider = Spider('chunk_test')
        self.spider.set_crawler(self.crawler)
        self.extension = ChunkedFeedExporter.from_crawler(self.crawler)
        self.extension.open_spider(self.spider)
        # Add items if we have to
        if n_items:
            self.add_items(n_items)

    def stop(self):
        return self.extension.close_spider(self.spider)

    def remove_temp_dir(self):
        shutil.rmtree(EXPORT_TEMP_DIR, ignore_errors=True)

    def add_items(self, n_items):
        for i in range(n_items):
            item = ItemGenerator.generate()
            self.extension.item_scraped(item, self.spider)

    def get_chunk_filename(self, chunk):
        return EXPORT_FILE_PATTERN % {'chunk_number': chunk}

    def get_chunk_filenames(self):
        return [f for f in os.listdir(EXPORT_TEMP_DIR) if f.endswith(".json")]

    def get_number_of_chunks(self):
        return len(self.get_chunk_filenames())

    def get_chunk_content(self, chunk):
        with open(self.get_chunk_filename(chunk)) as f:
            return json.load(f)

    def ensure_number_of_chunks(self, n_chunks):
        n = self.get_number_of_chunks()
        assert n_chunks == n, "Wrong number of chunks. found %d, expecting %d" % (n, n_chunks)

    def ensure_number_of_exported_items_per_chunk(self, chunk, n_items):
        n_exported_items = len(self.get_chunk_content(chunk))
        assert n_items == n_exported_items, "Wrong number of exported items. found %d, expecting %d" % \
            (n_exported_items, n_items)
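# Hypothetical test built on the fixture above (not from the source). It
# assumes unittest.TestCase mixing, that stop() completes synchronously, that
# chunk numbering starts at 0, and that 10 items with 5 items per chunk
# produce exactly 2 chunk files.
class ChunkCountTest(ChunkExtensionTest, unittest.TestCase):
    def test_items_are_split_into_chunks(self):
        self.start(n_items_per_chunk=5, n_items=10)
        self.stop()
        self.ensure_number_of_chunks(2)
        self.ensure_number_of_exported_items_per_chunk(0, 5)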
def _export_streamitem(self, values):
    item = load_item_from_values(values)
    crawler = get_crawler()
    spider = Spider('streamitem_test')
    spider.set_crawler(crawler)
    storage = StreamItemFileFeedStorage(EXPORT_SC_FILENAME)
    exporter = StreamItemExporter(file=storage.open(spider))
    exporter.start_exporting()
    exporter.export_item(item)
    exporter.finish_exporting()
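# Illustrative call of the exporter helper above (not from the source); it
# assumes load_item_from_values() accepts a plain dict, and the field names
# are placeholders.
def test_export_single_item(self):
    self._export_streamitem({'url': 'http://scrapytest.org/', 'title': 'example'})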
class ManagerTestCase(TestCase):
    settings_dict = None

    def setUp(self):
        self.crawler = get_crawler(self.settings_dict)
        self.spider = Spider('foo')
        self.spider.set_crawler(self.crawler)
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes the downloader mw manager's download method and returns
        the result (Request or Response), or raises an exception in case of
        failure.
        """
        if not response:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret
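# Hypothetical subclass of ManagerTestCase (not from the source) showing how
# _download() is typically driven: a canned Response is pushed through the
# full downloader middleware chain and should come back as a Response.
class DefaultsTest(ManagerTestCase):
    def test_request_response(self):
        req = Request('http://example.com/index.html')
        resp = Response(req.url, status=200)
        ret = self._download(req, resp)
        self.assertTrue(isinstance(ret, Response), "Non-response returned")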
def get_spider_and_mw(self, default_useragent):
    crawler = get_crawler({'USER_AGENT': default_useragent})
    spider = Spider('foo')
    spider.set_crawler(crawler)
    return spider, UserAgentMiddleware.from_crawler(crawler)
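# Sketch of a test using the helper above (not from the source): it assumes
# the middleware falls back to the USER_AGENT setting when the spider does
# not override user_agent.
def test_default_useragent(self):
    spider, mw = self.get_spider_and_mw('default_useragent')
    req = Request('http://scrapytest.org/')
    assert mw.process_request(req, spider) is None
    self.assertEqual(req.headers['User-Agent'], 'default_useragent')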
def get_request_spider_mw(self):
    crawler = get_crawler()
    spider = Spider('foo')
    spider.set_crawler(crawler)
    request = Request('http://scrapytest.org/')
    return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
def get_request_spider_mw(self): crawler = get_crawler() spider = Spider("foo") spider.set_crawler(crawler) request = Request("http://scrapytest.org/") return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)