Example 1
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = Spider(name='example.com',
                        allowed_domains=['example.org', 'example.net'])
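        # url_is_from_spider also treats the spider name ('example.com') as an allowed
        # domain, so example.com URLs match even though it is not in allowed_domains.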
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://sub.example.com/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://example.com/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.org/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.net/some/page.html',
                               spider))
        self.assertFalse(
            url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = Spider(name='example.com',
                        allowed_domains=set(('example.com', 'example.net')))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))

        spider = Spider(name='example.com',
                        allowed_domains=('example.com', 'example.net'))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))
Example 2
 def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = Spider('foo')
     spider.set_crawler(crawler)
     defaults = dict([(k, [v]) for k, v in \
         six.iteritems(crawler.settings.get('DEFAULT_REQUEST_HEADERS'))])
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Example 3
    def __init__(self, name=None, **kwargs):
        Spider.__init__(self, name, **kwargs)

        self.conn = MySQLdb.connect(
            host="localhost", 
            user="******", 
            passwd="123456", 
            db="driving", 
            charset="utf8"
        )
        self.cursor = self.conn.cursor()

        self.redispool = redis.ConnectionPool(
            host='localhost', 
            port=6379, 
            db=0
        )

        self.redis = redis.Redis(connection_pool=self.redispool)

        urls = self.getUrls()
        for url in urls:
            done = self.hasCrawled(url)
            if not done:
                self.start_urls.append(url)
                self.cacheTodo(url)
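The MySQL handles opened above are never released. A minimal sketch (editor's addition) of a matching cleanup hook, assuming the same self.conn and self.cursor attributes; recent Scrapy versions call a spider's closed() method when the spider_closed signal fires:

    def closed(self, reason):
        # Release the MySQL handles opened in __init__.
        self.cursor.close()
        self.conn.close()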
Example 4
    def test_scheduler_persistent(self):
        messages = []
        spider = Spider('myspider')
        spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

        self.scheduler.persist = True
        self.scheduler.open(spider)

        self.assertEqual(messages, [])

        self.scheduler.enqueue_request(Request('http://example.com/page1'))
        self.scheduler.enqueue_request(Request('http://example.com/page2'))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.scheduler.close('finish')
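        # persist=True keeps the queue on disk, so reopening the scheduler below
        # resumes the two pending requests instead of starting empty.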

        self.scheduler.open(spider)
        self.assertEqual(messages, [
            [('Resuming crawl (2 requests scheduled)',), {}],
        ])
        self.assertEqual(len(self.scheduler), 2)

        self.scheduler.persist = False
        self.scheduler.close('finish')

        self.assertEqual(len(self.scheduler), 0)
Example 6
class ChunkExtensionTest(object):
    settings = {}

    def tearDown(self):
        self.remove_temp_dir()

    def start(self, n_items_per_chunk=None, n_items=None, settings=None):

        # Reset item generator and remove temporary dir
        ItemGenerator.reset()
        self.remove_temp_dir()

        # Setup settings
        settings = settings or self.settings.copy()
        if n_items_per_chunk is not None:
            settings['CHUNKED_FEED_ITEMS_PER_CHUNK'] = n_items_per_chunk

        # Init Scrapy
        self.crawler = get_crawler(settings)
        self.spider = Spider('chunk_test')
        self.spider.set_crawler(self.crawler)
        self.extension = ChunkedFeedExporter.from_crawler(self.crawler)
        self.extension.open_spider(self.spider)

        # Add items if we have to
        if n_items:
            self.add_items(n_items)

    def stop(self):
        return self.extension.close_spider(self.spider)

    def remove_temp_dir(self):
        shutil.rmtree(EXPORT_TEMP_DIR, ignore_errors=True)

    def add_items(self, n_items):
        for i in range(n_items):
            item = ItemGenerator.generate()
            self.extension.item_scraped(item, self.spider)

    def get_chunk_filename(self, chunk):
        return EXPORT_FILE_PATTERN % {'chunk_number':chunk}

    def get_chunk_filenames(self):
        return [f for f in os.listdir(EXPORT_TEMP_DIR) if f.endswith(".json")]

    def get_number_of_chunks(self):
        return len(self.get_chunk_filenames())

    def get_chunk_content(self, chunk):
        with open(self.get_chunk_filename(chunk)) as f:
            return json.load(f)

    def ensure_number_of_chunks(self, n_chunks):
        n = self.get_number_of_chunks()
        assert n_chunks == n, "Wrong number of chunks. found %d, expecting %d" % (n, n_chunks)

    def ensure_number_of_exported_items_per_chunk(self, chunk, n_items):
        n_exported_items = len(self.get_chunk_content(chunk))
        assert n_items == n_exported_items, "Wrong number of exported items. found %d, expecting %d" % \
                                            (n_exported_items, n_items)
Example 7
 def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = Spider('foo')
     spider.set_crawler(crawler)
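     # Python 2 only: dict.iteritems(); the six.iteritems() form in Example 2 also runs on Python 3.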
     defaults = dict([(k, [v]) for k, v in \
         crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
     return defaults, spider, DefaultHeadersMiddleware.from_crawler(crawler)
Example 8
 def setUp(self):
     self.spider1 = Spider('name1')
     self.spider2 = Spider('name2')
     open_spiders = set([self.spider1, self.spider2])
     crawler = CrawlerMock(open_spiders)
     self.spref = SpiderReferencer(crawler)
     self.encoder = ScrapyJSONEncoder(spref=self.spref)
     self.decoder = ScrapyJSONDecoder(spref=self.spref)
Example 9
 def __init__(self, name=None, **kwargs):
     Spider.__init__(self, name, **kwargs)
     self.db = MySQLdb.connect(host="localhost",
         user="******",
         passwd="12345689",
         db="zhaopin",
         charset='utf8')                         
     self.cursor = self.db.cursor()
Example 10
 def setUp(self):
     self.crawler = get_crawler(self.settings_dict)
     self.spider = Spider('foo')
     self.spider.set_crawler(self.crawler)
     self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
     # some mw depends on stats collector
     self.crawler.stats.open_spider(self.spider)
     return self.mwman.open_spider(self.spider)
Example 11
 def _export_streamitem(self, values):
     item = load_item_from_values(values)
     crawler = get_crawler()
     spider = Spider('streamitem_test')
     spider.set_crawler(crawler)
     storage = StreamItemFileFeedStorage(EXPORT_SC_FILENAME)
     exporter = StreamItemExporter(file=storage.open(spider))
     exporter.start_exporting()
     exporter.export_item(item)
     exporter.finish_exporting()
Example 12
    def test_host_header_seted_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, 'example.com')
            self.assertEquals(request.headers.get('Host'), 'example.com')

        request = Request(self.getURL('host'), headers={'Host': 'example.com'})
        return self.download_request(request, Spider('foo')).addCallback(_test)
Example 13
    def test_download_with_maxsize(self):
        request = Request(self.getURL('file'))

        # The body for this request is 10 bytes, so 10 is the smallest limit that
        # succeeds; the limit applies to the response body only, not the headers.
        d = self.download_request(request, Spider('foo', download_maxsize=10))
        d.addCallback(lambda r: r.body)
        d.addCallback(self.assertEquals, "0123456789")
        yield d

        d = self.download_request(request, Spider('foo', download_maxsize=9))
        yield self.assertFailure(d, defer.CancelledError,
                                 error.ConnectionAborted)
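The download_maxsize attribute passed to Spider above can also be declared on a spider class or driven by settings. A minimal sketch (editor's addition), assuming a Scrapy version that honours DOWNLOAD_MAXSIZE / DOWNLOAD_WARNSIZE and per-spider custom_settings:

from scrapy.spiders import Spider

class CappedSpider(Spider):
    name = 'capped'
    download_maxsize = 1024 * 1024                       # abort response bodies larger than 1 MiB
    custom_settings = {'DOWNLOAD_WARNSIZE': 512 * 1024}  # log a warning at 512 KiB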
Example 14
 def setUp(self):
     self.spider = Spider('foo')
     self.mw = HttpErrorMiddleware(
         Settings({'HTTPERROR_ALLOWED_CODES': (402, )}))
     self.req = Request('http://scrapytest.org')
     self.res200, self.res404, self.res402 = _responses(
         self.req, [200, 404, 402])
Example 15
class Bot(Resource):
    spider = Spider('slyd')

    def __init__(self, settings, spec_manager):
        # twisted base class is old-style so we cannot use super()
        Resource.__init__(self)
        self.spec_manager = spec_manager
        settings.set('PLUGINS', [p['bot'] for p in settings.get('PLUGINS')])
        # initialize scrapy crawler
        crawler = Crawler(settings)
        crawler.configure()
        crawler.signals.connect(self.keep_spider_alive, signals.spider_idle)
        crawler.crawl(self.spider)
        crawler.start()

        self.crawler = crawler
        log.msg("bot initialized", level=log.DEBUG)

    def keep_spider_alive(self, spider):
        raise DontCloseSpider("keeping it open")

    def stop(self):
        """Stop the crawler"""
        self.crawler.stop()
        log.msg("bot stopped", level=log.DEBUG)
Example 16
 def _assert_stores(self, storage, path):
     spider = Spider("default")
     file = storage.open(spider)
     file.write("content")
     yield storage.store(file)
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
Example 17
 def test_store(self):
     out = StringIO()
     storage = StdoutFeedStorage('stdout:', _stdout=out)
     file = storage.open(Spider("default"))
     file.write("content")
     yield storage.store(file)
     self.assertEqual(out.getvalue(), "content")
Example 18
 def test_payload(self):
     body = '1' * 100  # PayloadResource requires body length to be 100
     request = Request(self.getURL('payload'), method='POST', body=body)
     d = self.download_request(request, Spider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEquals, body)
     return d
Example 19
    def setUp(self):
        self.spider = Spider('scrapytest.org')

        self.stats = StatsCollector(get_crawler())
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)
Example 20
    def test_host_header_not_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
            self.assertEquals(request.headers, {})

        request = Request(self.getURL('host'))
        return self.download_request(request, Spider('foo')).addCallback(_test)
Example 22
    def test_download_without_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, '/path/to/resource')

        request = Request(self.getURL('path/to/resource'))
        return self.download_request(request, Spider('foo')).addCallback(_test)
Example 23
    def __init__(self, **kwargs):
        Spider.__init__(self, **kwargs)

        self.config_file = kwargs.get('config_file', None)
        config = kwargs.get('config', None)
        if self.config_file:
            with open(self.config_file) as f:
                jconfig = jsonload(f)
        elif config:
            jconfig = jsonloads(config)
        else:
            self.log('config_file or config is expected', level=log.CRITICAL)
            raise Exception('config_file or config is expected')

        self.template = config_parse(jconfig)

        # Entry URL for a single page to crawl; useful for testing or for crawling one page on its own.
        self.specify_url = kwargs.get('specify_url', None)
Example 24
 def _assert_stores(self, storage, path):
     spider = Spider("default")
     file = storage.open(spider)
     file.write(b"content")
     yield storage.store(file)
     self.assertTrue(os.path.exists(path))
     with open(path, 'rb') as fp:
         self.assertEqual(fp.read(), b"content")
Example 25
    def test_store_load(self):
        jobdir = self.mktemp()
        os.mkdir(jobdir)
        spider = Spider(name='default')
        dt = datetime.now()

        ss = SpiderState(jobdir)
        ss.spider_opened(spider)
        spider.state['one'] = 1
        spider.state['dt'] = dt
        ss.spider_closed(spider)

        spider2 = Spider(name='default')
        ss2 = SpiderState(jobdir)
        ss2.spider_opened(spider2)
        self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
        ss2.spider_closed(spider2)
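Outside the test, the same persistence is normally obtained by running the crawl with a JOBDIR. A minimal sketch (editor's addition) of a spider that relies on the state dict, assuming the stock SpiderState extension is enabled:

from scrapy.spiders import Spider

class CountingSpider(Spider):
    name = 'counting'
    start_urls = ['http://example.com']

    def parse(self, response):
        # Persisted between runs when the crawl is started with a job directory,
        # e.g. scrapy crawl counting -s JOBDIR=crawls/counting-1
        self.state['pages'] = self.state.get('pages', 0) + 1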
Example 27
 def test_state_attribute(self):
     # state attribute must be present if jobdir is not set, to provide a
     # consistent interface
     spider = Spider(name='default')
     ss = SpiderState()
     ss.spider_opened(spider)
     self.assertEqual(spider.state, {})
     ss.spider_closed(spider)
Example 28
    def test_download(self):
        def _test(response):
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.status, 200)
            self.assertEquals(response.body, '0123456789')

        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')
        return self.download_request(request, Spider('foo')).addCallback(_test)
Example 29
    def test_download_with_proxy_https_noconnect(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, 'https://example.com')

        http_proxy = '%s?noconnect' % self.getURL('')
        request = Request('https://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, Spider('foo')).addCallback(_test)
Example 30
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({}))
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
Example 31
    def __init__(self, name=None, **kwargs):
        Spider.__init__(self, name, **kwargs)

        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            db = 'driving',
            user = '******',
            passwd = '123456',
            cursorclass = MySQLdb.cursors.DictCursor,
            charset = 'utf8',
            use_unicode = False
        )

        specialCities = [110000, 120000, 310000, 500000]
        cities = json.loads(self.jsonStr)
        for city in cities:
            if city['parent'] or (city['code'] in specialCities):
                self.start_urls.append(''.join(['http://jiaxiao.jiaxiaozhijia.com/',city['pinyin']]))
                self.city_codes[city['pinyin']] = city['code']
Example 32
 def _assert_stores(self, storage, path):
     spider = Spider("default")
     file = storage.open(spider)
     file.write("content")
     yield storage.store(file)
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
     # again, to check s3 objects are overwritten
     yield storage.store(StringIO("new content"))
     self.failUnlessEqual(open(path).read(), "new content")
Example 33
    def setUp(self):
        self.environ = os.environ.copy()
        self.spider = Spider('myspider', arg1='val1', start_urls = ["http://example.com"])

        def _log(x):
            print(x)

        self.spider.log = _log
        self.response = HtmlResponse(body="<html></html>", url="http://www.example.com/product/8798732")
        self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"})
Example 34
    def test_process_spider_output(self):
        res = Response('http://scrapytest.org')

        short_url_req = Request('http://scrapytest.org/')
        long_url_req = Request('http://scrapytest.org/this_is_a_long_url')
        reqs = [short_url_req, long_url_req]

        mw = UrlLengthMiddleware(maxlength=25)
        spider = Spider('foo')
        out = list(mw.process_spider_output(res, reqs, spider))
        self.assertEquals(out, [short_url_req])
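UrlLengthMiddleware is normally configured through the URLLENGTH_LIMIT setting rather than built with an explicit maxlength. A minimal sketch (editor's addition), assuming the import path used by recent Scrapy releases:

from scrapy.settings import Settings
from scrapy.spidermiddlewares.urllength import UrlLengthMiddleware

mw = UrlLengthMiddleware.from_settings(Settings({'URLLENGTH_LIMIT': 25}))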
Example 35
 def test_timeout_download_from_spider(self):
     spider = Spider('foo')
     meta = {'download_timeout': 0.2}
     # client connects but no data is received
     request = Request(self.getURL('wait'), meta=meta)
     d = self.download_request(request, spider)
     yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
     # client connects, server sends headers and some body bytes but hangs
     request = Request(self.getURL('hang-after-headers'), meta=meta)
     d = self.download_request(request, spider)
     yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
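The download_timeout meta key used above can also be set per spider or project-wide. A minimal sketch (editor's addition), assuming DownloadTimeoutMiddleware is enabled as in a default install:

from scrapy import Request
from scrapy.spiders import Spider

class SlowSiteSpider(Spider):
    name = 'slowsite'
    download_timeout = 30    # per-spider default picked up by DownloadTimeoutMiddleware

    def start_requests(self):
        # Per-request override, equivalent to the meta dict used in the test above.
        yield Request('http://example.com/slow', meta={'download_timeout': 5})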
Example 36
    def setUp(self):
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(
            Settings({'HTTPERROR_ALLOWED_CODES': (402, )}))
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
        self.res402 = Response('http://scrapytest.org', status=402)
        self.res402.request = self.req
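Outside of middleware tests, allowed error codes are usually declared on the spider or per request instead of through HTTPERROR_ALLOWED_CODES. A minimal sketch (editor's addition), assuming the default HttpErrorMiddleware is active:

from scrapy import Request
from scrapy.spiders import Spider

class TolerantSpider(Spider):
    name = 'tolerant'
    handle_httpstatus_list = [404]   # let 404 responses reach this spider's callbacks

    def start_requests(self):
        # Per-request equivalent; the meta key takes precedence over the spider attribute.
        yield Request('http://example.com/maybe-missing',
                      meta={'handle_httpstatus_list': [402]})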
Example 37
 def _assert_stores(self, storage, path):
     spider = Spider("default")
     file = storage.open(spider)
     file.write(b"content")
     yield storage.store(file)
     self.assertTrue(os.path.exists(path))
     with open(path, 'rb') as fp:
         self.assertEqual(fp.read(), b"content")
     # again, to check s3 objects are overwritten
     yield storage.store(BytesIO(b"new content"))
     with open(path, 'rb') as fp:
         self.assertEqual(fp.read(), b"new content")
Example 38
    def _open_spider(self, request, spider):
        if self.spider:
            return self.spider

        if spider is None:
            spider = create_spider_for_request(self.crawler.spiders,
                                               request,
                                               Spider('default'),
                                               log_multiple=True)
        spider.set_crawler(self.crawler)
        self.crawler.engine.open_spider(spider, close_if_idle=False)
        self.spider = spider
        return spider
Example 39
 def setUp(self):
     self.yesterday = email.utils.formatdate(time.time() - 86400)
     self.today = email.utils.formatdate()
     self.tomorrow = email.utils.formatdate(time.time() + 86400)
     self.crawler = get_crawler()
     self.spider = Spider('example.com')
     self.tmpdir = tempfile.mkdtemp()
     self.request = Request('http://www.example.com',
                            headers={'User-Agent': 'test'})
     self.response = Response('http://www.example.com',
                              headers={'Content-Type': 'text/html'},
                              body='test body',
                              status=202)
     self.crawler.stats.open_spider(self.spider)
Example 40
class ManagerTestCase(TestCase):

    settings_dict = None

    def setUp(self):
        self.crawler = get_crawler(self.settings_dict)
        self.spider = Spider('foo')
        self.spider.set_crawler(self.crawler)
        self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
        # some mw depends on stats collector
        self.crawler.stats.open_spider(self.spider)
        return self.mwman.open_spider(self.spider)

    def tearDown(self):
        self.crawler.stats.close_spider(self.spider, '')
        return self.mwman.close_spider(self.spider)

    def _download(self, request, response=None):
        """Executes downloader mw manager's download method and returns
        the result (Request or Response) or raise exception in case of
        failure.
        """
        if not response:
            response = Response(request.url)

        def download_func(**kwargs):
            return response

        dfd = self.mwman.download(download_func, request, self.spider)
        # catch deferred result and return the value
        results = []
        dfd.addBoth(results.append)
        self._wait(dfd)
        ret = results[0]
        if isinstance(ret, Failure):
            ret.raiseException()
        return ret
Example 41
    def start(self, n_items_per_chunk=None, n_items=None, settings=None):

        # Reset item generator and remove temporary dir
        ItemGenerator.reset()
        self.remove_temp_dir()

        # Setup settings
        settings = settings or self.settings.copy()
        if n_items_per_chunk is not None:
            settings['CHUNKED_FEED_ITEMS_PER_CHUNK'] = n_items_per_chunk

        # Init Scrapy
        self.crawler = get_crawler(settings)
        self.spider = Spider('chunk_test')
        self.spider.set_crawler(self.crawler)
        self.extension = ChunkedFeedExporter.from_crawler(self.crawler)
        self.extension.open_spider(self.spider)

        # Add items if we have to
        if n_items:
            self.add_items(n_items)
Example 42
 def __init__(self):
     Spider.__init__(self)
     self.verificationErrors = []
     self.driver = webdriver.Firefox() 
Example 43
    def __init__(self):
        Spider.__init__(self)
        self.browser = webdriver.Firefox()
        # self.cursor must be created elsewhere (e.g. from a MySQLdb connection)
        self.cursor.execute(
            'create table if not exists CleaningAgents (cleaningAgentID int primary key, '
            'name varchar(20), description varchar(20), instruction varchar(20), '
            'applicationTime long, frequency long, cleaningAgentType varchar(20))')
Example 44
 def get_request_spider_mw(self):
     crawler = get_crawler()
     spider = Spider("foo")
     spider.set_crawler(crawler)
     request = Request("http://scrapytest.org/")
     return request, spider, DownloadTimeoutMiddleware.from_crawler(crawler)
Example 45
 def __init__(self, config, **kwargs):
     Spider.__init__(self, **kwargs)
     self.config_file = kwargs.get('config_file')
     self.config = FocusedCrawlerConfigure(config, self.config_file).config
Example 46
 def __del__(self):
     self.driver.quit()
     print(self.verificationErrors)
     # Spider defines no __del__ of its own, so no superclass call is needed here.
Example 47
 def get_spider_and_mw(self, default_useragent):
     crawler = get_crawler({'USER_AGENT': default_useragent})
     spider = Spider('foo')
     spider.set_crawler(crawler)
     return spider, UserAgentMiddleware.from_crawler(crawler)
Example 48
    def __init__(self):
        Spider.__init__(self)