def test_url_is_from_spider_with_allowed_domains(self):
    spider = Spider(name='example.com',
                    allowed_domains=['example.org', 'example.net'])
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://sub.example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://example.com/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://www.example.org/some/page.html', spider))
    self.assertTrue(
        url_is_from_spider('http://www.example.net/some/page.html', spider))
    self.assertFalse(
        url_is_from_spider('http://www.example.us/some/page.html', spider))

    spider = Spider(name='example.com',
                    allowed_domains=set(('example.com', 'example.net')))
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))

    spider = Spider(name='example.com',
                    allowed_domains=('example.com', 'example.net'))
    self.assertTrue(
        url_is_from_spider('http://www.example.com/some/page.html', spider))
def get_defaults_spider_mw(self):
    crawler = get_crawler()
    spider = Spider('foo')
    spider.set_crawler(crawler)
    defaults = dict([(k, [v]) for k, v in
                     six.iteritems(crawler.settings.get('DEFAULT_REQUEST_HEADERS'))])
    return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
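# A minimal usage sketch (not part of the original collection; the method name
# is hypothetical) showing how the helper above is typically consumed:
# DefaultHeadersMiddleware.process_request stamps the configured default
# headers onto outgoing requests. On Python 3 the header keys and values are
# stored as bytes, so the comparison below assumes the Python 2 form of the
# helper.
def test_process_request_sketch(self):
    defaults, spider, mw = self.get_defaults_spider_mw()
    req = Request('http://www.scrapytest.org')
    mw.process_request(req, spider)
    self.assertEquals(req.headers, defaults)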
def __init__(self, name=None, **kwargs):
    Spider.__init__(self, name)
    # MySQL connection used to store crawled data
    self.conn = MySQLdb.connect(
        host="localhost",
        user="******",
        passwd="123456",
        db="driving",
        charset="utf8"
    )
    self.cursor = self.conn.cursor()
    # Redis connection used to track which URLs have already been crawled
    self.redispool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    self.redis = redis.Redis(connection_pool=self.redispool)
    urls = self.getUrls()
    for url in urls:
        done = self.hasCrawled(url)
        if not done:
            self.start_urls.append(url)
            self.cacheTodo(url)
def test_scheduler_persistent(self):
    messages = []
    spider = Spider('myspider')
    spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

    self.scheduler.persist = True
    self.scheduler.open(spider)
    self.assertEqual(messages, [])

    self.scheduler.enqueue_request(Request('http://example.com/page1'))
    self.scheduler.enqueue_request(Request('http://example.com/page2'))
    self.assertTrue(self.scheduler.has_pending_requests())
    self.scheduler.close('finish')

    self.scheduler.open(spider)
    self.assertEqual(messages, [
        [('Resuming crawl (2 requests scheduled)',), {}],
    ])
    self.assertEqual(len(self.scheduler), 2)

    self.scheduler.persist = False
    self.scheduler.close('finish')
    self.assertEqual(len(self.scheduler), 0)
class ChunkExtensionTest(object):

    settings = {}

    def tearDown(self):
        self.remove_temp_dir()

    def start(self, n_items_per_chunk=None, n_items=None, settings=None):
        # Reset item generator and remove temporary dir
        ItemGenerator.reset()
        self.remove_temp_dir()

        # Setup settings
        settings = settings or self.settings.copy()
        if n_items_per_chunk is not None:
            settings['CHUNKED_FEED_ITEMS_PER_CHUNK'] = n_items_per_chunk

        # Init Scrapy
        self.crawler = get_crawler(settings)
        self.spider = Spider('chunk_test')
        self.spider.set_crawler(self.crawler)
        self.extension = ChunkedFeedExporter.from_crawler(self.crawler)
        self.extension.open_spider(self.spider)

        # Add items if we have to
        if n_items:
            self.add_items(n_items)

    def stop(self):
        return self.extension.close_spider(self.spider)

    def remove_temp_dir(self):
        shutil.rmtree(EXPORT_TEMP_DIR, ignore_errors=True)

    def add_items(self, n_items):
        for i in range(n_items):
            item = ItemGenerator.generate()
            self.extension.item_scraped(item, self.spider)

    def get_chunk_filename(self, chunk):
        return EXPORT_FILE_PATTERN % {'chunk_number': chunk}

    def get_chunk_filenames(self):
        return [f for f in os.listdir(EXPORT_TEMP_DIR) if f.endswith(".json")]

    def get_number_of_chunks(self):
        return len(self.get_chunk_filenames())

    def get_chunk_content(self, chunk):
        with open(self.get_chunk_filename(chunk)) as f:
            return json.load(f)

    def ensure_number_of_chunks(self, n_chunks):
        n = self.get_number_of_chunks()
        assert n_chunks == n, \
            "Wrong number of chunks. found %d, expecting %d" % (n, n_chunks)

    def ensure_number_of_exported_items_per_chunk(self, chunk, n_items):
        n_exported_items = len(self.get_chunk_content(chunk))
        assert n_items == n_exported_items, \
            "Wrong number of exported items. found %d, expecting %d" % \
            (n_exported_items, n_items)
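# A minimal usage sketch for the ChunkExtensionTest harness above (not part of
# the original collection). The test-case name, chunk sizes, zero-based chunk
# numbering, and the unittest.TestCase base are assumptions; stop() may return
# a deferred from close_spider(), which a trial-based test would return or
# yield instead of discarding.
class ChunkExtensionUsageSketch(ChunkExtensionTest, unittest.TestCase):

    def test_items_are_split_into_chunks(self):
        # 10 items with 5 items per chunk should produce 2 chunk files
        self.start(n_items_per_chunk=5, n_items=10)
        self.stop()
        self.ensure_number_of_chunks(2)
        self.ensure_number_of_exported_items_per_chunk(0, 5)
        self.ensure_number_of_exported_items_per_chunk(1, 5)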
def get_defaults_spider_mw(self):
    crawler = get_crawler()
    spider = Spider('foo')
    spider.set_crawler(crawler)
    defaults = dict([(k, [v]) for k, v in
                     crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
    return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
def setUp(self):
    self.spider1 = Spider('name1')
    self.spider2 = Spider('name2')
    open_spiders = set([self.spider1, self.spider2])
    crawler = CrawlerMock(open_spiders)
    self.spref = SpiderReferencer(crawler)
    self.encoder = ScrapyJSONEncoder(spref=self.spref)
    self.decoder = ScrapyJSONDecoder(spref=self.spref)
def __init__(self, name=None, **kwargs):
    Spider.__init__(self, name, **kwargs)
    self.db = MySQLdb.connect(host="localhost", user="******",
                              passwd="12345689", db="zhaopin", charset='utf8')
    self.cursor = self.db.cursor()
def setUp(self):
    self.crawler = get_crawler(self.settings_dict)
    self.spider = Spider('foo')
    self.spider.set_crawler(self.crawler)
    self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
    # some mw depends on stats collector
    self.crawler.stats.open_spider(self.spider)
    return self.mwman.open_spider(self.spider)
def _export_streamitem(self, values):
    item = load_item_from_values(values)
    crawler = get_crawler()
    spider = Spider('streamitem_test')
    spider.set_crawler(crawler)
    storage = StreamItemFileFeedStorage(EXPORT_SC_FILENAME)
    exporter = StreamItemExporter(file=storage.open(spider))
    exporter.start_exporting()
    exporter.export_item(item)
    exporter.finish_exporting()
def test_host_header_set_in_request_headers(self):
    def _test(response):
        self.assertEquals(response.body, 'example.com')
        self.assertEquals(request.headers.get('Host'), 'example.com')

    request = Request(self.getURL('host'), headers={'Host': 'example.com'})
    return self.download_request(request, Spider('foo')).addCallback(_test)
def test_download_with_maxsize(self):
    request = Request(self.getURL('file'))

    # 10 is the minimal size for this request, and the limit is only counted
    # on the response body (regardless of headers)
    d = self.download_request(request, Spider('foo', download_maxsize=10))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEquals, "0123456789")
    yield d

    d = self.download_request(request, Spider('foo', download_maxsize=9))
    yield self.assertFailure(d, defer.CancelledError, error.ConnectionAborted)
def setUp(self):
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(
        Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
    self.req = Request('http://scrapytest.org')
    self.res200, self.res404, self.res402 = _responses(
        self.req, [200, 404, 402])
class Bot(Resource):
    spider = Spider('slyd')

    def __init__(self, settings, spec_manager):
        # twisted base class is old-style so we cannot use super()
        Resource.__init__(self)
        self.spec_manager = spec_manager
        settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
        # initialize scrapy crawler
        crawler = Crawler(settings)
        crawler.configure()
        crawler.signals.connect(self.keep_spider_alive, signals.spider_idle)
        crawler.crawl(self.spider)
        crawler.start()
        self.crawler = crawler
        log.msg("bot initialized", level=log.DEBUG)

    def keep_spider_alive(self, spider):
        raise DontCloseSpider("keeping it open")

    def stop(self):
        """Stop the crawler"""
        self.crawler.stop()
        log.msg("bot stopped", level=log.DEBUG)
def _assert_stores(self, storage, path): spider = Spider("default") file = storage.open(spider) file.write("content") yield storage.store(file) self.failUnless(os.path.exists(path)) self.failUnlessEqual(open(path).read(), "content")
def test_store(self):
    out = StringIO()
    storage = StdoutFeedStorage('stdout:', _stdout=out)
    file = storage.open(Spider("default"))
    file.write("content")
    yield storage.store(file)
    self.assertEqual(out.getvalue(), "content")
def test_payload(self):
    body = '1' * 100  # PayloadResource requires body length to be 100
    request = Request(self.getURL('payload'), method='POST', body=body)
    d = self.download_request(request, Spider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEquals, body)
    return d
def setUp(self):
    self.spider = Spider('scrapytest.org')
    self.stats = StatsCollector(get_crawler())
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats, True)
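# A minimal sketch (modelled on Scrapy's own DepthMiddleware tests; the method
# name is hypothetical) showing how the setUp above is used:
# process_spider_output tags requests with a depth and, with verbose stats
# enabled, records a request_depth_count/<depth> stat.
def test_process_spider_output_sketch(self):
    req = Request('http://scrapytest.org')
    resp = Response('http://scrapytest.org')
    resp.request = req
    result = [Request('http://scrapytest.org')]
    out = list(self.mw.process_spider_output(resp, result, self.spider))
    self.assertEquals(out, result)
    rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
    self.assertEquals(rdc, 1)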
def test_host_header_not_in_request_headers(self):
    def _test(response):
        self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
        self.assertEquals(request.headers, {})

    request = Request(self.getURL('host'))
    return self.download_request(request, Spider('foo')).addCallback(_test)
def test_download_without_proxy(self):
    def _test(response):
        self.assertEquals(response.status, 200)
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.body, '/path/to/resource')

    request = Request(self.getURL('path/to/resource'))
    return self.download_request(request, Spider('foo')).addCallback(_test)
def __init__(self, **kwargs):
    Spider.__init__(self, **kwargs)
    self.config_file = kwargs.get('config_file', None)
    config = kwargs.get('config', None)
    if self.config_file:
        jconfig = jsonload(open(self.config_file))
    elif config:
        jconfig = jsonloads(config)
    else:
        self.log('config_file or config is expected', level=log.CRITICAL)
        raise Exception('config_file or config is expected')
    self.template = config_parse(jconfig)
    # A single entry URL to crawl; useful for testing or for crawling one
    # specific page on its own
    self.specify_url = kwargs.get('specify_url', None)
def _assert_stores(self, storage, path): spider = Spider("default") file = storage.open(spider) file.write(b"content") yield storage.store(file) self.assertTrue(os.path.exists(path)) with open(path, 'rb') as fp: self.assertEqual(fp.read(), b"content")
def test_store_load(self):
    jobdir = self.mktemp()
    os.mkdir(jobdir)
    spider = Spider(name='default')
    dt = datetime.now()

    ss = SpiderState(jobdir)
    ss.spider_opened(spider)
    spider.state['one'] = 1
    spider.state['dt'] = dt
    ss.spider_closed(spider)

    # a fresh spider opened against the same jobdir should load the state
    spider2 = Spider(name='default')
    ss2 = SpiderState(jobdir)
    ss2.spider_opened(spider2)
    self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
    ss2.spider_closed(spider2)
def test_state_attribute(self):
    # state attribute must be present if jobdir is not set, to provide a
    # consistent interface
    spider = Spider(name='default')
    ss = SpiderState()
    ss.spider_opened(spider)
    self.assertEqual(spider.state, {})
    ss.spider_closed(spider)
def test_download(self):
    def _test(response):
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.status, 200)
        self.assertEquals(response.body, '0123456789')

    request = Request(path_to_file_uri(self.tmpname + '^'))
    assert request.url.upper().endswith('%5E')
    return self.download_request(request, Spider('foo')).addCallback(_test)
def test_download_with_proxy_https_noconnect(self):
    def _test(response):
        self.assertEquals(response.status, 200)
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.body, 'https://example.com')

    http_proxy = '%s?noconnect' % self.getURL('')
    request = Request('https://example.com', meta={'proxy': http_proxy})
    return self.download_request(request, Spider('foo')).addCallback(_test)
def setUp(self):
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(Settings({}))
    self.req = Request('http://scrapytest.org')
    self.res200 = Response('http://scrapytest.org', status=200)
    self.res200.request = self.req
    self.res404 = Response('http://scrapytest.org', status=404)
    self.res404.request = self.req
def __init__(self, name=None, **kwargs):
    Spider.__init__(self, name)
    self.dbpool = adbapi.ConnectionPool(
        'MySQLdb',
        db='driving',
        user='******',
        passwd='123456',
        cursorclass=MySQLdb.cursors.DictCursor,
        charset='utf8',
        use_unicode=False
    )
    specialCities = [110000, 120000, 310000, 500000]
    cities = json.loads(self.jsonStr)
    for city in cities:
        if city['parent'] or (city['code'] in specialCities):
            self.start_urls.append(
                ''.join(['http://jiaxiao.jiaxiaozhijia.com/', city['pinyin']]))
            self.city_codes[city['pinyin']] = city['code']
def _assert_stores(self, storage, path): spider = Spider("default") file = storage.open(spider) file.write("content") yield storage.store(file) self.failUnless(os.path.exists(path)) self.failUnlessEqual(open(path).read(), "content") # again, to check s3 objects are overwritten yield storage.store(StringIO("new content")) self.failUnlessEqual(open(path).read(), "new content")
def setUp(self):
    self.environ = os.environ.copy()
    self.spider = Spider('myspider', arg1='val1',
                         start_urls=["http://example.com"])

    def _log(x):
        print(x)

    self.spider.log = _log
    self.response = HtmlResponse(body="<html></html>",
                                 url="http://www.example.com/product/8798732")
    self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros",
                          "url": "http://www.example.com/product.html?item_no=345"})
def test_process_spider_output(self):
    res = Response('http://scrapytest.org')
    short_url_req = Request('http://scrapytest.org/')
    long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
    reqs = [short_url_req, long_url_req]

    mw = UrlLengthMiddleware(maxlength=25)
    spider = Spider('foo')
    out = list(mw.process_spider_output(res, reqs, spider))
    self.assertEquals(out, [short_url_req])
def test_timeout_download_from_spider(self):
    spider = Spider('foo')
    meta = {'download_timeout': 0.2}

    # client connects but no data is received
    request = Request(self.getURL('wait'), meta=meta)
    d = self.download_request(request, spider)
    yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)

    # client connects, server sends headers and some body bytes but then hangs
    request = Request(self.getURL('hang-after-headers'), meta=meta)
    d = self.download_request(request, spider)
    yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
def setUp(self):
    self.spider = Spider('foo')
    self.mw = HttpErrorMiddleware(
        Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
    self.req = Request('http://scrapytest.org')
    self.res200 = Response('http://scrapytest.org', status=200)
    self.res200.request = self.req
    self.res404 = Response('http://scrapytest.org', status=404)
    self.res404.request = self.req
    self.res402 = Response('http://scrapytest.org', status=402)
    self.res402.request = self.req
def _assert_stores(self, storage, path): spider = Spider("default") file = storage.open(spider) file.write(b"content") yield storage.store(file) self.assertTrue(os.path.exists(path)) with open(path, 'rb') as fp: self.assertEqual(fp.read(), b"content") # again, to check s3 objects are overwritten yield storage.store(BytesIO(b"new content")) with open(path, 'rb') as fp: self.assertEqual(fp.read(), b"new content")
def _open_spider(self, request, spider):
    if self.spider:
        return self.spider

    if spider is None:
        spider = create_spider_for_request(self.crawler.spiders, request,
                                           Spider('default'), log_multiple=True)

    spider.set_crawler(self.crawler)
    self.crawler.engine.open_spider(spider, close_if_idle=False)
    self.spider = spider
    return spider
def setUp(self):
    self.yesterday = email.utils.formatdate(time.time() - 86400)
    self.today = email.utils.formatdate()
    self.tomorrow = email.utils.formatdate(time.time() + 86400)
    self.crawler = get_crawler()
    self.spider = Spider('example.com')
    self.tmpdir = tempfile.mkdtemp()
    self.request = Request('http://www.example.com',
                           headers={'User-Agent': 'test'})
    self.response = Response('http://www.example.com',
                             headers={'Content-Type': 'text/html'},
                             body='test body',
                             status=202)
    self.crawler.stats.open_spider(self.spider)
class ManagerTestCase(TestCase):

    settings_dict = None

    def setUp(self):
        self.crawler = get_crawler(self.settings_dict)
        self.spider = Spider('foo')
        self.spider.set_crawler(self.crawler)
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes the downloader mw manager's download method and returns
        the result (Request or Response), or raises an exception in case of
        failure.
        """
        if not response:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret
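# A minimal sketch of a concrete test built on ManagerTestCase (the class and
# method names are hypothetical): a plain 200 response handed to _download
# should come back through the default downloader middleware chain unchanged.
class DefaultsPassThroughSketch(ManagerTestCase):

    def test_response_passes_through(self):
        req = Request('http://example.com/index.html')
        resp = Response(req.url, body=b'hello')
        ret = self._download(request=req, response=resp)
        self.assertTrue(isinstance(ret, Response))
        self.assertEqual(ret.body, b'hello')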
def start(self, n_items_per_chunk=None, n_items=None, settings=None):
    # Reset item generator and remove temporary dir
    ItemGenerator.reset()
    self.remove_temp_dir()

    # Setup settings
    settings = settings or self.settings.copy()
    if n_items_per_chunk is not None:
        settings['CHUNKED_FEED_ITEMS_PER_CHUNK'] = n_items_per_chunk

    # Init Scrapy
    self.crawler = get_crawler(settings)
    self.spider = Spider('chunk_test')
    self.spider.set_crawler(self.crawler)
    self.extension = ChunkedFeedExporter.from_crawler(self.crawler)
    self.extension.open_spider(self.spider)

    # Add items if we have to
    if n_items:
        self.add_items(n_items)
def __init__(self):
    Spider.__init__(self)
    self.verificationErrors = []
    self.driver = webdriver.Firefox()
def __init__(self):
    Spider.__init__(self)
    self.browser = webdriver.Firefox()
    # NOTE: self.cursor is used here without being defined first; a database
    # connection and cursor (e.g. from MySQLdb, as in the other spiders in
    # this collection) must be created before this call.
    self.cursor.execute(
        'create table if not exists CleaningAgents ('
        'cleaningAgentID int primary key, name varchar(20), '
        'description varchar(20), instruction varchar(20), '
        'applicationTime long, frequency long, '
        'cleaningAgentType varchar(20))')
def get_request_spider_mw(self):
    crawler = get_crawler()
    spider = Spider("foo")
    spider.set_crawler(crawler)
    request = Request("http://scrapytest.org/")
    return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
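# A minimal sketch (modelled on Scrapy's DownloadTimeoutMiddleware tests; the
# method name is hypothetical) showing the helper above in use: after
# spider_opened, process_request copies the configured timeout (180 seconds
# with default settings) into request.meta['download_timeout'].
def test_default_download_timeout_sketch(self):
    req, spider, mw = self.get_request_spider_mw()
    mw.spider_opened(spider)
    assert mw.process_request(req, spider) is None
    self.assertEquals(req.meta.get('download_timeout'), 180)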
def __init__(self, config, **kwargs):
    Spider.__init__(self, **kwargs)
    self.config_file = kwargs.get('config_file')
    self.config = FocusedCrawlerConfigure(config, self.config_file).config
def __del__(self):
    self.driver.quit()
    print(self.verificationErrors)
    # note: the base Spider class does not define __del__, so this call may
    # raise AttributeError
    Spider.__del__(self)
def get_spider_and_mw(self, default_useragent):
    crawler = get_crawler({'USER_AGENT': default_useragent})
    spider = Spider('foo')
    spider.set_crawler(crawler)
    return spider, UserAgentMiddleware.from_crawler(crawler)
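# A minimal sketch (modelled on Scrapy's UserAgentMiddleware tests; the method
# name is hypothetical) showing the helper above in use: after spider_opened,
# process_request sets the configured User-Agent header on outgoing requests.
# Header values are stored as bytes on Python 3, hence the b'' comparison.
def test_default_useragent_sketch(self):
    spider, mw = self.get_spider_and_mw('default_useragent')
    mw.spider_opened(spider)
    req = Request('http://scrapytest.org/')
    assert mw.process_request(req, spider) is None
    self.assertEquals(req.headers['User-Agent'], b'default_useragent')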
def __init__(self):
    Spider.__init__(self)