def test_link_escaping():
  link = Link('/foo/bar#baz.pex')
  assert link.scheme == 'file'
  assert link.local
  assert link.local_path == os.path.realpath('/foo/bar#baz.pex')

  link = Link('http://www.google.com/%20/%3Afile+%2B2.tar.gz')
  assert link.filename == ':file++2.tar.gz'

def test_crawler_remote_redirect():
  Crawler.reset_cache()

  mock_context = mock.create_autospec(Context, spec_set=True)
  mock_context.resolve = lambda link: Link('http://url2.test.com')
  mock_context.content.side_effect = [MOCK_INDEX_A]
  expected_output = set([Link('http://url2.test.com/3to2-1.0.tar.gz')])

  c = Crawler(mock_context)
  test_links = [Link('http://url1.test.com')]
  assert c.crawl(test_links) == expected_output

def test_link_schemes():
  link = Link('http://www.google.com')
  assert link.scheme == 'http'
  assert link.remote

  link = Link('https://www.google.com')
  assert link.scheme == 'https'
  assert link.remote

  link = Link('/foo/bar')
  assert link.scheme == 'file'
  assert link.local
  assert link.path == os.path.realpath('/foo/bar')

def test_crawler_remote():
  Crawler.reset_cache()

  mock_context = mock.create_autospec(Context, spec_set=True)
  mock_context.resolve = lambda link: link
  mock_context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]
  expected_output = set([
      Link('http://url1.test.com/3to2-1.0.tar.gz'),
      Link('http://url2.test.com/APScheduler-2.1.0.tar.gz'),
  ])

  c = Crawler(mock_context)
  test_links = [Link('http://url1.test.com'), Link('http://url2.test.com')]
  assert c.crawl(test_links) == expected_output

  # Test memoization of Crawler.crawl().
  assert c.crawl(test_links) == expected_output

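# Hypothetical stand-ins for the MOCK_INDEX_A / MOCK_INDEX_B fixtures consumed
# by the two crawler tests above; the real fixtures in the test module may look
# different. Any simple-index style HTML whose hrefs match the expected links
# would satisfy the assertions.
MOCK_INDEX_A = '''
<html><body>
  <a href="http://url1.test.com/3to2-1.0.tar.gz">3to2-1.0.tar.gz</a>
</body></html>
'''

MOCK_INDEX_B = '''
<html><body>
  <a href="http://url2.test.com/APScheduler-2.1.0.tar.gz">APScheduler-2.1.0.tar.gz</a>
</body></html>
'''
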
def test_urllib_context_utf8_encoding():
  BYTES = b'this is a decoded utf8 string'

  with named_temporary_file() as tf:
    tf.write(BYTES)
    tf.flush()
    local_link = Link.wrap(tf.name)

    # Trick UrllibContext into thinking this is a remote link
    class MockUrllibContext(UrllibContext):
      def open(self, link):
        return super(MockUrllibContext, self).open(local_link)

    context = MockUrllibContext()
    assert context.content(Link.wrap('http://www.google.com')) == BYTES.decode(
        UrllibContext.DEFAULT_ENCODING)

def test_requests_context():
  context = RequestsContext(verify=False)

  with make_url(BLOB, make_md5(BLOB)) as url:
    assert context.read(Link.wrap(url)) == BLOB

  with make_url(BLOB, make_md5(BLOB)) as url:
    filename = context.fetch(Link.wrap(url))
    with open(filename, 'rb') as fp:
      assert fp.read() == BLOB

  # test local reading
  with named_temporary_file() as tf:
    tf.write(b'goop')
    tf.flush()
    assert context.read(Link.wrap(tf.name)) == b'goop'

def crawl(self, link_or_links, follow_links=False):
  links = list(Link.wrap_iterable(link_or_links))
  cache_key = self._make_cache_key(links, follow_links)

  # Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE).
  result = self._CRAWL_CACHE.get(cache_key)
  if result is None:
    result = self._crawl(links, follow_links)
    self._CRAWL_CACHE.store(cache_key, result)
  return result

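# The _CRAWL_CACHE used above is a class-level memoizer shared by every
# Crawler instance, which is why the crawler tests call Crawler.reset_cache()
# before exercising crawl(). Below is a minimal sketch of the thread-safe
# get/store interface crawl() relies on; the real implementation may differ.
import threading


class Memoizer(object):
  """Thread-safe mapping from a cache key to a previously computed result."""

  def __init__(self):
    self._data = {}
    self._lock = threading.Lock()

  def get(self, key, default=None):
    with self._lock:
      return self._data.get(key, default)

  def store(self, key, value):
    with self._lock:
      self._data[key] = value
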
def test_requests_context_retries_connect_timeout_retries_exhausted():
  with mock.patch.object(
      requests.packages.urllib3.connectionpool.HTTPConnectionPool,
      '_make_request') as mock_make_request:

    url, mock_make_request.side_effect = timeout_side_effect(num_timeouts=3)
    context = RequestsContext(verify=False, max_retries=2)
    with pytest.raises(Context.Error):
      context.read(Link.wrap(url))

def test_requests_context_retries_connect_timeout():
  with mock.patch.object(
      requests.packages.urllib3.connectionpool.HTTPConnectionPool,
      '_make_request') as mock_make_request:

    url, mock_make_request.side_effect = timeout_side_effect()
    context = RequestsContext(verify=False)
    data = context.read(Link.wrap(url))
    assert data == BLOB

def test_requests_context_retries_connect_timeout_retries_exhausted():
  with mock.patch.object(
      requests.packages.urllib3.connectionpool.HTTPConnectionPool,
      '_make_request') as mock_make_request:

    url, mock_make_request.side_effect = timeout_side_effect(num_timeouts=3)
    env = Variables(environ={'PEX_HTTP_RETRIES': '2'})
    context = RequestsContext(verify=False, env=env)
    with pytest.raises(Context.Error):
      context.read(Link.wrap(url))

def test_crawler_local():
  FL = ('a.txt', 'b.txt', 'c.txt')
  with temporary_dir() as td:
    for fn in FL:
      with open(os.path.join(td, fn), 'w'):
        pass
    for dn in (1, 2):
      os.mkdir(os.path.join(td, 'dir%d' % dn))
      for fn in FL:
        with open(os.path.join(td, 'dir%d' % dn, fn), 'w'):
          pass

    # basic file / dir rel splitting
    links, rels = Crawler.crawl_local(Link.wrap(td))
    assert set(links) == set(Link.wrap(os.path.join(td, fn)) for fn in FL)
    assert set(rels) == set(Link.wrap(os.path.join(td, 'dir%d' % n)) for n in (1, 2))

    # recursive crawling, single vs multi-threaded
    for caching in (False, True):
      for threads in (1, 2, 3):
        links = Crawler(threads=threads).crawl([td], follow_links=True)
        expect_links = (set(Link.wrap(os.path.join(td, fn)) for fn in FL) |
                        set(Link.wrap(os.path.join(td, 'dir1', fn)) for fn in FL) |
                        set(Link.wrap(os.path.join(td, 'dir2', fn)) for fn in FL))
        assert set(links) == expect_links

def test_link_wrapping():
  link = Link.wrap('https://www.google.com')
  assert link.url == 'https://www.google.com'

  link = Link.wrap(Link.wrap('https://www.google.com'))
  assert link.url == 'https://www.google.com'

  with pytest.raises(ValueError):
    Link.wrap(1234)

  with pytest.raises(ValueError):
    Link.wrap_iterable(1234)

  links = Link.wrap_iterable('https://www.google.com')
  assert len(links) == 1
  assert links[0].url == 'https://www.google.com'

  links = Link.wrap_iterable(['https://www.google.com', Link('http://www.google.com')])
  assert set(links) == set([
      Link('http://www.google.com'),
      Link('https://www.google.com'),
  ])

def from_href(cls, href, **kw):
  """Convert from a url to Package.

  :param href: The url to parse
  :type href: string
  :returns: A Package object if a valid concrete implementation exists, otherwise None.
  """
  package = cls._HREF_TO_PACKAGE_CACHE.get(href)
  if package is not None:
    return package
  link_href = Link.wrap(href)
  for package_type in cls._REGISTRY:
    try:
      package = package_type(link_href.url, **kw)
      break
    except package_type.InvalidPackage:
      continue
  if package is not None:
    cls._HREF_TO_PACKAGE_CACHE.store(href, package)
  return package

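# A hypothetical usage sketch for from_href; the hrefs and the behavior notes
# below are assumptions drawn from the code above, not from a specific registry.
pkg = Package.from_href('http://url1.test.com/3to2-1.0.tar.gz')
unknown = Package.from_href('http://url1.test.com/README.txt')

# `pkg` is an instance of the first registered Package subclass whose
# constructor accepts the href (e.g. a source-distribution type); `unknown`
# is None because every registered type raises InvalidPackage for it.
# Successful resolutions are stored in _HREF_TO_PACKAGE_CACHE, so repeated
# calls with the same href return the cached object.
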
def test_stream_filelike_without_md5():
  with make_url(BLOB) as url:
    request = requests.get(url)
    filelike = StreamFilelike(request, Link.wrap(url))
    assert filelike.read() == BLOB

def test_stream_filelike_with_incorrect_md5():
  with make_url(BLOB, 'f' * 32) as url:
    request = requests.get(url)
    filelike = StreamFilelike(request, Link.wrap(url))
    with pytest.raises(Context.Error):
      filelike.read()

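# Hypothetical stand-ins for the BLOB / make_md5 / make_url helpers used by the
# RequestsContext and StreamFilelike tests above; the real helpers in the test
# module may be implemented differently. This sketch assumes the checksum
# travels in the URL fragment (e.g. '#md5=<hex>'), which is what the
# incorrect-md5 test appears to rely on.
import contextlib
import hashlib
import threading

try:
  from http.server import BaseHTTPRequestHandler, HTTPServer  # Python 3
except ImportError:
  from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer  # Python 2

BLOB = b'this is a blob of data served by make_url'  # stand-in payload


def make_md5(blob):
  return hashlib.md5(blob).hexdigest()


@contextlib.contextmanager
def make_url(blob, md5=None):
  """Serve `blob` from a throwaway local HTTP server and yield its URL."""

  class Handler(BaseHTTPRequestHandler):
    def do_GET(self):
      self.send_response(200)
      self.send_header('Content-Length', str(len(blob)))
      self.end_headers()
      self.wfile.write(blob)

    def log_message(self, fmt, *args):  # keep test output quiet
      pass

  server = HTTPServer(('127.0.0.1', 0), Handler)
  thread = threading.Thread(target=server.serve_forever)
  thread.daemon = True
  thread.start()
  try:
    url = 'http://127.0.0.1:%d/blob.tar.gz' % server.server_port
    yield url if md5 is None else '%s#md5=%s' % (url, md5)
  finally:
    server.shutdown()
    server.server_close()
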
def test_link_join():
  link = Link('https://www.google.com/bar/')
  assert link.join('/foo').url == 'https://www.google.com/foo'
  assert link.join('#foo').url == 'https://www.google.com/bar/#foo'
  assert link.join('foo').url == 'https://www.google.com/bar/foo'

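# The expected URLs above match standard RFC 3986 relative-reference resolution
# as implemented by urljoin; presumably Link.join delegates to it (an
# assumption, not shown in the snippet above).
try:
  from urllib.parse import urljoin  # Python 3
except ImportError:
  from urlparse import urljoin  # Python 2

base = 'https://www.google.com/bar/'
assert urljoin(base, '/foo') == 'https://www.google.com/foo'
assert urljoin(base, '#foo') == 'https://www.google.com/bar/#foo'
assert urljoin(base, 'foo') == 'https://www.google.com/bar/foo'
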
def test_link_equality():
  assert Link('http://www.google.com') == Link('http://www.google.com')
  assert Link('http://www.google.com') != Link('http://www.twitter.com')

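# The equality test above, together with the set-based assertions in the
# crawler and wrapping tests, implies that Link compares and hashes by its
# normalized URL. A minimal sketch of that contract (an illustrative class,
# not the real implementation):
class LinkLike(object):
  def __init__(self, url):
    self.url = url

  def __eq__(self, other):
    return isinstance(other, LinkLike) and self.url == other.url

  def __ne__(self, other):
    return not self == other

  def __hash__(self):
    return hash(self.url)


assert LinkLike('http://www.google.com') == LinkLike('http://www.google.com')
assert LinkLike('http://www.google.com') != LinkLike('http://www.twitter.com')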