Ejemplo n.º 1
0
def test_link_escaping():
    """Exercise Link with a '#' inside a local path and percent-escapes inside a URL."""
    # A bare filesystem path containing '#' must be reported as a local 'file' link.
    raw_path = '/foo/bar#baz.pex'
    local = Link(raw_path)
    assert local.scheme == 'file'
    assert local.local
    assert local.local_path == os.path.realpath(raw_path)

    # The asserted filename shows the escapes of the last URL component decoded.
    remote = Link('http://www.google.com/%20/%3Afile+%2B2.tar.gz')
    assert remote.filename == ':file++2.tar.gz'
Ejemplo n.º 2
0
def test_crawler_remote_redirect():
    """Crawl one remote link through a context that resolves every link to url2.

    The expected result is a single link rooted at the resolved (url2) host.
    """
    Crawler.reset_cache()

    def resolve_to_url2(link):
        # Ignore the incoming link and always hand back the url2 link.
        return Link('http://url2.test.com')

    context = mock.create_autospec(Context, spec_set=True)
    context.resolve = resolve_to_url2
    context.content.side_effect = [MOCK_INDEX_A]

    wanted = set([Link('http://url2.test.com/3to2-1.0.tar.gz')])
    seeds = [Link('http://url1.test.com')]

    crawler = Crawler(context)
    assert crawler.crawl(seeds) == wanted
Ejemplo n.º 3
0
def test_link_schemes():
    """Check the scheme and remote/local flags Link reports for http, https and bare paths."""
    for scheme in ('http', 'https'):
        remote = Link('%s://www.google.com' % scheme)
        assert remote.scheme == scheme
        assert remote.remote

    # A bare filesystem path is reported with the 'file' scheme and a realpath'd path.
    raw_path = '/foo/bar'
    local = Link(raw_path)
    assert local.scheme == 'file'
    assert local.local
    assert local.path == os.path.realpath(raw_path)
Ejemplo n.º 4
0
def test_crawler_remote():
  """Crawl two remote links and verify that a repeated crawl needs no further content reads."""
  Crawler.reset_cache()

  context = mock.create_autospec(Context, spec_set=True)
  context.resolve = lambda link: link
  # Two index pages are available; any third content() call raises and fails the test.
  context.content.side_effect = [MOCK_INDEX_A, MOCK_INDEX_B, Exception('shouldnt get here')]

  wanted = set([
      Link('http://url1.test.com/3to2-1.0.tar.gz'),
      Link('http://url2.test.com/APScheduler-2.1.0.tar.gz'),
  ])
  seeds = [Link('http://url1.test.com'), Link('http://url2.test.com')]
  crawler = Crawler(context)

  # First pass performs the crawl; second pass tests memoization of Crawler.crawl().
  for _attempt in range(2):
    assert crawler.crawl(seeds) == wanted
Ejemplo n.º 5
0
def test_urllib_context_utf8_encoding():
  """Check that UrllibContext.content() returns the bytes decoded with DEFAULT_ENCODING."""
  payload = b'this is a decoded utf8 string'

  with named_temporary_file() as tmp:
    tmp.write(payload)
    tmp.flush()
    file_link = Link.wrap(tmp.name)

    class LocalBackedUrllibContext(UrllibContext):
      # Trick UrllibContext into thinking this is a remote link: whatever link is
      # requested, the temp file is what actually gets opened.
      def open(self, link):
        return super(LocalBackedUrllibContext, self).open(file_link)

    remote_link = Link.wrap('http://www.google.com')
    expected_text = payload.decode(UrllibContext.DEFAULT_ENCODING)
    assert LocalBackedUrllibContext().content(remote_link) == expected_text
Ejemplo n.º 6
0
def test_requests_context():
    """Exercise RequestsContext.read() and fetch() on a served URL, then read() on a local file."""
    ctx = RequestsContext(verify=False)

    # Reading the URL yields the blob directly.
    with make_url(BLOB, make_md5(BLOB)) as blob_url:
        assert ctx.read(Link.wrap(blob_url)) == BLOB

    # Fetching the URL yields a filename whose contents are the blob.
    with make_url(BLOB, make_md5(BLOB)) as blob_url:
        fetched_path = ctx.fetch(Link.wrap(blob_url))
        with open(fetched_path, 'rb') as fetched:
            fetched_bytes = fetched.read()
        assert fetched_bytes == BLOB

    # test local reading
    with temporary_file() as scratch:
        scratch.write(b'goop')
        scratch.flush()
        assert ctx.read(Link.wrap(scratch.name)) == b'goop'
Ejemplo n.º 7
0
def test_urllib_context_utf8_encoding():
  """Verify content() output equals the raw bytes decoded via UrllibContext.DEFAULT_ENCODING."""
  raw = b'this is a decoded utf8 string'

  with named_temporary_file() as handle:
    handle.write(raw)
    handle.flush()
    on_disk = Link.wrap(handle.name)

    class FileServingContext(UrllibContext):
      # Trick UrllibContext into thinking this is a remote link by always
      # opening the on-disk link instead of the one that was asked for.
      def open(self, link):
        return super(FileServingContext, self).open(on_disk)

    ctx = FileServingContext()
    decoded = ctx.content(Link.wrap('http://www.google.com'))
    assert decoded == raw.decode(UrllibContext.DEFAULT_ENCODING)
Ejemplo n.º 8
0
def test_requests_context():
  """Exercise RequestsContext.read() and fetch() on a served URL, then read() on a local file."""
  ctx = RequestsContext(verify=False)

  # Reading the URL yields the blob directly.
  with make_url(BLOB, make_md5(BLOB)) as blob_url:
    assert ctx.read(Link.wrap(blob_url)) == BLOB

  # Fetching the URL yields a filename whose contents are the blob.
  with make_url(BLOB, make_md5(BLOB)) as blob_url:
    fetched_path = ctx.fetch(Link.wrap(blob_url))
    with open(fetched_path, 'rb') as fetched:
      fetched_bytes = fetched.read()
    assert fetched_bytes == BLOB

  # test local reading
  with named_temporary_file() as scratch:
    scratch.write(b'goop')
    scratch.flush()
    assert ctx.read(Link.wrap(scratch.name)) == b'goop'
Ejemplo n.º 9
0
  def crawl(self, link_or_links, follow_links=False):
    """Crawl the given link(s), reusing a previously stored result when one exists.

    :param link_or_links: A single link or an iterable of links; passed through
      ``Link.wrap_iterable`` before use.
    :param follow_links: Forwarded to ``_crawl`` and included in the cache key.
    :returns: The cached result for this (links, follow_links) key if present, otherwise
      the fresh result of ``_crawl``, which is stored before being returned.
    """
    wrapped = list(Link.wrap_iterable(link_or_links))
    key = self._make_cache_key(wrapped, follow_links)

    # Crawls are memoized in Crawler._CRAWL_CACHE; a hit short-circuits the crawl.
    cached = self._CRAWL_CACHE.get(key)
    if cached is not None:
      return cached

    fresh = self._crawl(wrapped, follow_links)
    self._CRAWL_CACHE.store(key, fresh)
    return fresh
Ejemplo n.º 10
0
def test_requests_context_retries_connect_timeout_retries_exhausted():
  """Expect Context.Error from read() with num_timeouts=3 against max_retries=2."""
  pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

  with mock.patch.object(pool_cls, '_make_request') as patched_request:
    # timeout_side_effect supplies both the URL to hit and the side effect to install.
    url, side_effect = timeout_side_effect(num_timeouts=3)
    patched_request.side_effect = side_effect

    ctx = RequestsContext(verify=False, max_retries=2)

    with pytest.raises(Context.Error):
      ctx.read(Link.wrap(url))
Ejemplo n.º 11
0
def test_requests_context_retries_connect_timeout():
    """Expect read() to return BLOB with the default timeout_side_effect() installed."""
    pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

    with mock.patch.object(pool_cls, '_make_request') as patched_request:
        # timeout_side_effect supplies both the URL to hit and the side effect to install.
        url, side_effect = timeout_side_effect()
        patched_request.side_effect = side_effect

        ctx = RequestsContext(verify=False)

        assert ctx.read(Link.wrap(url)) == BLOB
Ejemplo n.º 12
0
def test_requests_context_retries_connect_timeout():
  """Expect read() to return BLOB with the default timeout_side_effect() installed."""
  pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

  with mock.patch.object(pool_cls, '_make_request') as patched_request:
    # timeout_side_effect supplies both the URL to hit and the side effect to install.
    url, side_effect = timeout_side_effect()
    patched_request.side_effect = side_effect

    ctx = RequestsContext(verify=False)

    assert ctx.read(Link.wrap(url)) == BLOB
Ejemplo n.º 13
0
def test_requests_context_retries_connect_timeout_retries_exhausted():
  """Expect Context.Error from read() with num_timeouts=3 and PEX_HTTP_RETRIES set to '2'."""
  pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

  with mock.patch.object(pool_cls, '_make_request') as patched_request:
    # timeout_side_effect supplies both the URL to hit and the side effect to install.
    url, side_effect = timeout_side_effect(num_timeouts=3)
    patched_request.side_effect = side_effect

    # The retry setting reaches the context through its environment variables.
    retry_env = Variables(environ={'PEX_HTTP_RETRIES': '2'})
    ctx = RequestsContext(verify=False, env=retry_env)

    with pytest.raises(Context.Error):
      ctx.read(Link.wrap(url))
Ejemplo n.º 14
0
def test_requests_context_retries_connect_timeout_retries_exhausted():
  """Expect Context.Error from read() with num_timeouts=3 and PEX_HTTP_RETRIES set to '2'."""
  connection_pool = requests.packages.urllib3.connectionpool.HTTPConnectionPool

  with mock.patch.object(connection_pool, '_make_request') as fake_make_request:
    # timeout_side_effect supplies both the URL to hit and the side effect to install.
    target_url, effect = timeout_side_effect(num_timeouts=3)
    fake_make_request.side_effect = effect

    # The retry setting reaches the context through its environment variables.
    context = RequestsContext(
        verify=False,
        env=Variables(environ={'PEX_HTTP_RETRIES': '2'}))

    with pytest.raises(Context.Error):
      context.read(Link.wrap(target_url))
Ejemplo n.º 15
0
def test_requests_context_retries_connect_timeout_retries_exhausted():
    """Expect Context.Error from read() with num_timeouts=3 against max_retries=2."""
    pool_cls = requests.packages.urllib3.connectionpool.HTTPConnectionPool

    with mock.patch.object(pool_cls, '_make_request') as patched_request:
        # timeout_side_effect supplies both the URL to hit and the side effect to install.
        url, side_effect = timeout_side_effect(num_timeouts=3)
        patched_request.side_effect = side_effect

        ctx = RequestsContext(verify=False, max_retries=2)

        with pytest.raises(Context.Error):
            ctx.read(Link.wrap(url))
Ejemplo n.º 16
0
def test_crawler_local():
  """Crawl a temporary directory tree: one level via crawl_local, then recursively via crawl."""
  names = ('a.txt', 'b.txt', 'c.txt')

  def touch(path):
    # Create (or truncate) an empty file at path.
    with open(path, 'w'):
      pass

  with temporary_dir() as root:
    # Layout: three files at the root plus dir1/ and dir2/, each holding the same three files.
    for name in names:
      touch(os.path.join(root, name))
    for index in (1, 2):
      subdir = os.path.join(root, 'dir%d' % index)
      os.mkdir(subdir)
      for name in names:
        touch(os.path.join(subdir, name))

    # basic file / dir rel splitting
    found_links, found_rels = Crawler.crawl_local(Link.wrap(root))
    assert set(found_links) == set(Link.wrap(os.path.join(root, name)) for name in names)
    assert set(found_rels) == set(Link.wrap(os.path.join(root, 'dir%d' % n)) for n in (1, 2))

    # recursive crawling, single vs multi-threaded
    # NOTE(review): the outer loop value is unused; it only repeats the crawl a second
    # time -- presumably so the second pass can hit the crawl cache. Confirm intent.
    for _caching in (False, True):
      for threads in (1, 2, 3):
        crawled = Crawler(threads=threads).crawl([root], follow_links=True)
        expected = set()
        for base in (root, os.path.join(root, 'dir1'), os.path.join(root, 'dir2')):
          expected |= set(Link.wrap(os.path.join(base, name)) for name in names)
        assert set(crawled) == expected
Ejemplo n.º 17
0
def test_crawler_local():
    """Crawl a temporary directory tree: one level via crawl_local, then recursively via crawl."""
    names = ('a.txt', 'b.txt', 'c.txt')

    def touch(path):
        # Create (or truncate) an empty file at path.
        with open(path, 'w'):
            pass

    with temporary_dir() as root:
        # Layout: three files at the root plus dir1/ and dir2/, each with the same three files.
        for name in names:
            touch(os.path.join(root, name))
        for index in (1, 2):
            subdir = os.path.join(root, 'dir%d' % index)
            os.mkdir(subdir)
            for name in names:
                touch(os.path.join(subdir, name))

        # basic file / dir rel splitting
        found_links, found_rels = Crawler.crawl_local(Link.wrap(root))
        root_files = set(Link.wrap(os.path.join(root, name)) for name in names)
        root_dirs = set(Link.wrap(os.path.join(root, 'dir%d' % n)) for n in (1, 2))
        assert set(found_links) == root_files
        assert set(found_rels) == root_dirs

        # recursive crawling, single vs multi-threaded
        # NOTE(review): the outer loop value is unused; it only repeats the crawl a second
        # time -- presumably so the second pass can hit the crawl cache. Confirm intent.
        for _caching in (False, True):
            for threads in (1, 2, 3):
                crawled = Crawler(threads=threads).crawl([root], follow_links=True)
                expected = set()
                for base in (root, os.path.join(root, 'dir1'), os.path.join(root, 'dir2')):
                    expected |= set(Link.wrap(os.path.join(base, name)) for name in names)
                assert set(crawled) == expected
Ejemplo n.º 18
0
def test_link_wrapping():
    """Check Link.wrap and Link.wrap_iterable on strings, Links, lists and invalid input."""
    https_url = "https://www.google.com"
    http_url = "http://www.google.com"

    # Wrapping a string, or an already wrapped Link, yields a link with the same url.
    assert Link.wrap(https_url).url == https_url
    assert Link.wrap(Link.wrap(https_url)).url == https_url

    # An int is rejected with ValueError by both entry points.
    for wrapper in (Link.wrap, Link.wrap_iterable):
        with pytest.raises(ValueError):
            wrapper(1234)

    # A lone string becomes a one-element indexable result.
    single = Link.wrap_iterable(https_url)
    assert len(single) == 1
    assert single[0].url == https_url

    # Strings and Link instances may be mixed within one iterable.
    mixed = Link.wrap_iterable([https_url, Link(http_url)])
    assert set(mixed) == set([Link(http_url), Link(https_url)])
Ejemplo n.º 19
0
def test_link_wrapping():
  """Check Link.wrap and Link.wrap_iterable on strings, Links, lists and invalid input."""
  https_url = 'https://www.google.com'
  http_url = 'http://www.google.com'

  # Wrapping a string, or an already wrapped Link, yields a link with the same url.
  assert Link.wrap(https_url).url == https_url
  assert Link.wrap(Link.wrap(https_url)).url == https_url

  # An int is rejected with ValueError by both entry points.
  for wrapper in (Link.wrap, Link.wrap_iterable):
    with pytest.raises(ValueError):
      wrapper(1234)

  # A lone string becomes a one-element indexable result.
  single = Link.wrap_iterable(https_url)
  assert len(single) == 1
  assert single[0].url == https_url

  # Strings and Link instances may be mixed within one iterable.
  mixed = Link.wrap_iterable([https_url, Link(http_url)])
  assert set(mixed) == set([Link(http_url), Link(https_url)])
Ejemplo n.º 20
0
  def from_href(cls, href, **kw):
    """Build a Package from a url, consulting the href-to-package cache first.

    Each type in ``cls._REGISTRY`` is tried in order; the first one that does not raise
    its ``InvalidPackage`` wins and is stored in the cache under ``href``.

    :param href: The url to parse
    :type href: string
    :param kw: Extra keyword arguments forwarded to the package type's constructor.
    :returns: A Package object if a valid concrete implementation exists, otherwise None.
    """
    cached = cls._HREF_TO_PACKAGE_CACHE.get(href)
    if cached is not None:
      return cached

    link = Link.wrap(href)
    found = None
    for candidate_type in cls._REGISTRY:
      try:
        found = candidate_type(link.url, **kw)
      except candidate_type.InvalidPackage:
        # This implementation does not recognise the url; try the next one.
        continue
      else:
        break

    # Only successful lookups are cached; a miss is recomputed on the next call.
    if found is not None:
      cls._HREF_TO_PACKAGE_CACHE.store(href, found)
    return found
Ejemplo n.º 21
0
def test_stream_filelike_without_md5():
  """Read BLOB back through StreamFilelike from a URL created without an md5 argument."""
  with make_url(BLOB) as blob_url:
    response = requests.get(blob_url)
    stream = StreamFilelike(response, Link.wrap(blob_url))
    received = stream.read()
    assert received == BLOB
Ejemplo n.º 22
0
def test_stream_filelike_with_incorrect_md5():
  """Expect Context.Error from StreamFilelike.read() when the URL is made with md5 'f' * 32."""
  bogus_md5 = 'f' * 32
  with make_url(BLOB, bogus_md5) as blob_url:
    response = requests.get(blob_url)
    stream = StreamFilelike(response, Link.wrap(blob_url))
    with pytest.raises(Context.Error):
      stream.read()
Ejemplo n.º 23
0
def test_link_join():
  """Check Link.join() with an absolute path, a fragment and a relative path."""
  base = Link('https://www.google.com/bar/')
  # (href passed to join, url expected on the joined link)
  cases = (
      ('/foo', 'https://www.google.com/foo'),
      ('#foo', 'https://www.google.com/bar/#foo'),
      ('foo', 'https://www.google.com/bar/foo'),
  )
  for href, expected_url in cases:
    assert base.join(href).url == expected_url
Ejemplo n.º 24
0
def test_link_join():
    """Check Link.join() with an absolute path, a fragment and a relative path."""
    base = Link("https://www.google.com/bar/")
    # (href passed to join, url expected on the joined link)
    cases = (
        ("/foo", "https://www.google.com/foo"),
        ("#foo", "https://www.google.com/bar/#foo"),
        ("foo", "https://www.google.com/bar/foo"),
    )
    for href, expected_url in cases:
        assert base.join(href).url == expected_url
Ejemplo n.º 25
0
def test_link_wrapping():
    """Check Link.wrap and Link.wrap_iterable on strings, Links, lists and invalid input."""
    https_url = 'https://www.google.com'
    http_url = 'http://www.google.com'

    # Wrapping a string, or an already wrapped Link, yields a link with the same url.
    assert Link.wrap(https_url).url == https_url
    assert Link.wrap(Link.wrap(https_url)).url == https_url

    # An int is rejected with ValueError by both entry points.
    for wrapper in (Link.wrap, Link.wrap_iterable):
        with pytest.raises(ValueError):
            wrapper(1234)

    # A lone string becomes a one-element indexable result.
    single = Link.wrap_iterable(https_url)
    assert len(single) == 1
    assert single[0].url == https_url

    # Strings and Link instances may be mixed within one iterable.
    mixed = Link.wrap_iterable([https_url, Link(http_url)])
    assert set(mixed) == set([Link(http_url), Link(https_url)])
Ejemplo n.º 26
0
def test_link_join():
    """Check Link.join() with an absolute path, a fragment and a relative path."""
    base = Link('https://www.google.com/bar/')
    # (href passed to join, url expected on the joined link)
    cases = (
        ('/foo', 'https://www.google.com/foo'),
        ('#foo', 'https://www.google.com/bar/#foo'),
        ('foo', 'https://www.google.com/bar/foo'),
    )
    for href, expected_url in cases:
        assert base.join(href).url == expected_url
Ejemplo n.º 27
0
def test_stream_filelike_with_incorrect_md5():
    """Expect Context.Error from StreamFilelike.read() when the URL is made with md5 'f' * 32."""
    bogus_md5 = 'f' * 32
    with make_url(BLOB, bogus_md5) as blob_url:
        response = requests.get(blob_url)
        stream = StreamFilelike(response, Link.wrap(blob_url))
        with pytest.raises(Context.Error):
            stream.read()
Ejemplo n.º 28
0
def test_link_equality():
    """Links built from the same url compare equal; links from different urls do not."""
    google_url = 'http://www.google.com'
    twitter_url = 'http://www.twitter.com'

    # Two separately constructed Link objects are compared in each assertion.
    assert Link(google_url) == Link(google_url)
    assert Link(google_url) != Link(twitter_url)
Ejemplo n.º 29
0
def test_stream_filelike_without_md5():
    """Read BLOB back through StreamFilelike from a URL created without an md5 argument."""
    with make_url(BLOB) as blob_url:
        response = requests.get(blob_url)
        stream = StreamFilelike(response, Link.wrap(blob_url))
        received = stream.read()
        assert received == BLOB