Esempio n. 1
0
 def test_url_to_filename_with_etags(self):
     """Round-trip several URLs plus an ETag through the cache-name helpers.

     For each URL the test builds a cache filename with ``etag="mytag"``,
     creates the cache file and its ``.json`` metadata sidecar in
     ``self.TEST_DIR``, and checks that ``filename_to_url`` returns the
     original URL and ETag. It finally checks that an ETag of ``'1'`` does
     not produce the same filename as a URL ending in ``'1'``.
     """
     for url in ['http://allenai.org', 'http://allennlp.org',
                 'https://www.google.com', 'http://pytorch.org']:
         filename = url_to_filename(url, etag="mytag")
         assert "http" not in filename
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         # Use a context manager so the metadata file is closed (and therefore
         # flushed) before ``filename_to_url`` is called, instead of relying
         # on garbage collection of an anonymous file object.
         meta_path = os.path.join(self.TEST_DIR, filename + '.json')
         with open(meta_path, 'w') as meta_file:
             json.dump({'url': url, 'etag': 'mytag'}, meta_file)
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag == "mytag"
     # A URL suffix and an ETag with the same text must not collide.
     baseurl = 'http://allenai.org/'
     assert url_to_filename(baseurl + '1') != url_to_filename(baseurl, etag='1')
Esempio n. 2
0
 def test_url_to_filename_with_etags(self):
     """Verify URL + ETag survive a ``url_to_filename`` / ``filename_to_url`` round trip.

     Each iteration creates the cache file and a ``.json`` sidecar holding
     the URL and ETag under ``self.TEST_DIR``, then asserts that
     ``filename_to_url`` hands both values back unchanged. The last
     assertion checks that appending ``'1'`` to the URL and passing ``'1'``
     as the ETag give different filenames.
     """
     for url in ['http://allenai.org', 'http://allennlp.org',
                 'https://www.google.com', 'http://pytorch.org']:
         filename = url_to_filename(url, etag="mytag")
         assert "http" not in filename
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         # Close the sidecar deterministically; the previous bare ``open(...)``
         # argument left the handle to be closed by the garbage collector.
         with open(os.path.join(self.TEST_DIR, filename + '.json'), 'w') as meta_file:
             json.dump({'url': url, 'etag': 'mytag'}, meta_file)
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag == "mytag"
     baseurl = 'http://allenai.org/'
     assert url_to_filename(baseurl + '1') != url_to_filename(baseurl, etag='1')
Esempio n. 3
0
    def test_cached_path(self):
        """Exercise ``cached_path`` with a missing file, a bad URI, a local file and a URL."""
        glove_url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(glove_url, self.glove_bytes)

        # A local path that does not exist must raise FileNotFoundError.
        with pytest.raises(FileNotFoundError):
            cached_path(self.FIXTURES_ROOT / "does_not_exist" /
                        "fake_file.tar.gz")

        # A URI with an unknown scheme must raise ValueError.
        with pytest.raises(ValueError):
            cached_path("fakescheme://path/to/fake/file.tar.gz")

        # An existing local file is returned as its own path, as a string.
        local_file = self.glove_file
        assert cached_path(local_file) == str(local_file)

        # A URL is stored under the cache directory.
        cached_location = cached_path(glove_url, cache_dir=self.TEST_DIR)

        assert len(responses.calls) == 2
        expected_location = os.path.join(self.TEST_DIR,
                                         url_to_filename(glove_url, etag="0"))
        assert cached_location == expected_location

        with open(cached_location, 'rb') as handle:
            stored_bytes = handle.read()
        assert stored_bytes == self.glove_bytes
Esempio n. 4
0
    def test_cached_path(self):
        """Exercise ``cached_path`` on bad inputs, a local file, a URL and an archive member."""
        glove_url = "http://fake.datastore.com/glove.txt.gz"
        set_up_glove(glove_url, self.glove_bytes)

        # A local path that does not exist must raise FileNotFoundError.
        with pytest.raises(FileNotFoundError):
            cached_path(self.FIXTURES_ROOT / "does_not_exist" / "fake_file.tar.gz")

        # A URI with an unknown scheme must raise ValueError.
        with pytest.raises(ValueError):
            cached_path("fakescheme://path/to/fake/file.tar.gz")

        # An existing local file is returned as its own path, as a string.
        local_file = self.glove_file
        assert cached_path(local_file) == str(local_file)

        # A URL is stored under the cache directory.
        cached_location = cached_path(glove_url, cache_dir=self.TEST_DIR)

        assert len(responses.calls) == 2
        expected_location = os.path.join(self.TEST_DIR, url_to_filename(glove_url, etag="0"))
        assert cached_location == expected_location

        with open(cached_location, "rb") as handle:
            stored_bytes = handle.read()
        assert stored_bytes == self.glove_bytes

        # The "<archive>!<member>" form with extract_archive=True yields a readable file.
        archive_member = self.FIXTURES_ROOT / "common" / "quote.tar.gz!quote.txt"
        extracted_location = cached_path(archive_member, extract_archive=True)
        with open(extracted_location, "r") as quote_file:
            assert quote_file.read().startswith("I mean, ")
Esempio n. 5
0
 def test_url_to_filename(self):
     """Check that ``filename_to_url`` needs both the cache file and its metadata.

     For each URL (including one with a very long path) the test expects
     ``FileNotFoundError`` while nothing exists, again after only the cache
     file has been created, and a successful round trip once the ``.json``
     sidecar with ``etag: None`` has been written.
     """
     for url in [
             "http://allenai.org",
             "http://allennlp.org",
             "https://www.google.com",
             "http://pytorch.org",
             "https://allennlp.s3.amazonaws.com" + "/long" * 20 + "/url",
     ]:
         filename = url_to_filename(url)
         assert "http" not in filename
         # Nothing has been written to the cache directory yet.
         with pytest.raises(FileNotFoundError):
             filename_to_url(filename, cache_dir=self.TEST_DIR)
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         # The cache file alone is not sufficient; the sidecar is still missing.
         with pytest.raises(FileNotFoundError):
             filename_to_url(filename, cache_dir=self.TEST_DIR)
         # Write the sidecar through a context manager so the handle is closed
         # (and the JSON flushed) before it is looked up below.
         with open(os.path.join(self.TEST_DIR, filename + ".json"), "w") as meta_file:
             json.dump({"url": url, "etag": None}, meta_file)
         back_to_url, etag = filename_to_url(filename,
                                             cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag is None
Esempio n. 6
0
    def test_get_from_cache(self):
        """Check the HEAD/GET traffic and cached contents across three ``get_from_cache`` calls."""
        url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(url, self.glove_bytes, change_etag_every=2)

        def check_request_counts(heads, gets):
            # Only HEAD and GET should ever have been issued, in the given amounts.
            tally = Counter(call.request.method
                            for call in responses.calls)
            assert len(tally) == 2
            assert tally['HEAD'] == heads
            assert tally['GET'] == gets

        def check_contents(path):
            # The cached file must hold exactly the mocked payload.
            with open(path, 'rb') as cached_file:
                assert cached_file.read() == self.glove_bytes

        first = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert first == os.path.join(self.TEST_DIR,
                                     url_to_filename(url, etag="0"))

        # The initial fetch costs one HEAD request and one GET request.
        check_request_counts(heads=1, gets=1)
        check_contents(first)

        # A repeat call issues another HEAD but reuses the download (no new GET).
        second = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert second == first

        check_request_counts(heads=2, gets=1)
        check_contents(second)

        # The third call sees a new ETag, so it needs a HEAD and a fresh GET.
        third = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert third == os.path.join(self.TEST_DIR,
                                     url_to_filename(url, etag="1"))

        check_request_counts(heads=3, gets=2)
        check_contents(third)
Esempio n. 7
0
    def test_cached_path_offline(self, monkeypatch):
        """Check that ``get_from_cache`` falls back to the newest cached copy when offline.

        ``file_utils._http_etag`` is patched to raise ``ConnectionError``. Two
        cached versions with different ETags are then written one after the
        other, and the most recently written one is expected back. A third
        version whose filename carries no ETag is written last and is expected
        to win in turn.
        """
        # First we mock the `_http_etag` method so that it raises a `ConnectionError`,
        # like it would if there was no internet connection.
        def mocked_http_etag(url: str):
            raise ConnectionError

        monkeypatch.setattr(file_utils, "_http_etag", mocked_http_etag)

        url = "https://github.com/allenai/allennlp/blob/master/some-fake-resource"

        # We'll create two cached versions of this fake resource using two different etags.
        etags = [
            'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea916"',
            'W/"3e5885bfcbf4c47bc4ee9e2f6e5ea918"'
        ]
        filenames = [
            os.path.join(self.TEST_DIR, url_to_filename(url, etag))
            for etag in etags
        ]
        for filename, etag in zip(filenames, etags):
            meta_filename = filename + ".json"
            with open(filename, "w") as f:
                f.write("some random data")
            with open(meta_filename, "w") as meta_f:
                json.dump({"url": url, "etag": etag}, meta_f)
            # os.path.getmtime is only accurate to the second.
            time.sleep(1.1)

        # The version corresponding to the last etag should be returned, since
        # that one has the latest "last modified" time.
        assert get_from_cache(url, cache_dir=self.TEST_DIR) == filenames[-1]

        # We also want to make sure this works when the latest cached version doesn't
        # have a corresponding etag.
        filename = os.path.join(self.TEST_DIR, url_to_filename(url))
        meta_filename = filename + ".json"
        with open(filename, "w") as f:
            f.write("some random data")
        with open(meta_filename, "w") as meta_f:
            # Record `None` explicitly: this version has no etag. Previously the
            # loop variable `etag` leaked in here, so the metadata carried the
            # last etag from the loop above and contradicted the filename.
            json.dump({"url": url, "etag": None}, meta_f)

        assert get_from_cache(url, cache_dir=self.TEST_DIR) == filename
Esempio n. 8
0
 def test_url_to_filename_with_etags_eliminates_quotes(self):
     """Check that a double-quoted ETag comes back from ``filename_to_url`` unquoted."""
     sample_urls = ('http://allenai.org', 'http://allennlp.org',
                    'https://www.google.com', 'http://pytorch.org')
     for original_url in sample_urls:
         # The ETag is passed with its surrounding double quotes.
         encoded_name = url_to_filename(original_url, etag='"mytag"')
         assert "http" not in encoded_name
         cache_file = os.path.join(self.TEST_DIR, encoded_name)
         pathlib.Path(cache_file).touch()
         decoded_url, decoded_etag = filename_to_url(encoded_name)
         assert decoded_url == original_url
         # The quotes are expected to be gone after the round trip.
         assert decoded_etag == "mytag"
Esempio n. 9
0
 def test_url_to_filename(self):
     """Check that a URL without an ETag round-trips through the filename helpers."""
     sample_urls = ('http://allenai.org', 'http://allennlp.org',
                    'https://www.google.com', 'http://pytorch.org')
     for original_url in sample_urls:
         encoded_name = url_to_filename(original_url)
         # The generated name must not contain the scheme text.
         assert "http" not in encoded_name
         cache_file = os.path.join(self.TEST_DIR, encoded_name)
         pathlib.Path(cache_file).touch()
         decoded_url, decoded_etag = filename_to_url(encoded_name)
         assert decoded_url == original_url
         assert decoded_etag is None
Esempio n. 10
0
 def test_url_to_filename_with_etags(self):
     """Round-trip several URLs plus an ETag through the cache-name helpers.

     Each iteration creates the cache file and its ``.json`` metadata sidecar
     in ``self.TEST_DIR`` and asserts that ``filename_to_url`` returns the
     original URL and ETag. The final assertion checks that an ETag of
     ``"1"`` and a URL suffix of ``"1"`` map to different filenames.
     """
     for url in [
         "http://allenai.org",
         "http://allennlp.org",
         "https://www.google.com",
         "http://pytorch.org",
     ]:
         filename = url_to_filename(url, etag="mytag")
         assert "http" not in filename
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         # A context manager closes (and flushes) the sidecar before the
         # lookup below; the earlier inline ``open(...)`` was never closed
         # explicitly.
         with open(os.path.join(self.TEST_DIR, filename + ".json"), "w") as meta_file:
             json.dump({"url": url, "etag": "mytag"}, meta_file)
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag == "mytag"
     baseurl = "http://allenai.org/"
     assert url_to_filename(baseurl + "1") != url_to_filename(baseurl, etag="1")
Esempio n. 11
0
    def test_get_from_cache(self):
        """Check the HEAD/GET traffic and cached contents across three ``get_from_cache`` calls."""
        url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(url, self.glove_bytes, change_etag_every=2)

        def check_request_counts(heads, gets):
            # Only HEAD and GET should ever have been issued, in the given amounts.
            tally = Counter(call.request.method for call in responses.calls)
            assert len(tally) == 2
            assert tally['HEAD'] == heads
            assert tally['GET'] == gets

        def check_contents(path):
            # The cached file must hold exactly the mocked payload.
            with open(path, 'rb') as cached_file:
                assert cached_file.read() == self.glove_bytes

        first = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert first == os.path.join(self.TEST_DIR, url_to_filename(url, etag="0"))

        # The initial fetch costs one HEAD request and one GET request.
        check_request_counts(heads=1, gets=1)
        check_contents(first)

        # A repeat call issues another HEAD but reuses the download (no new GET).
        second = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert second == first

        check_request_counts(heads=2, gets=1)
        check_contents(second)

        # The third call sees a new ETag, so it needs a HEAD and a fresh GET.
        third = get_from_cache(url, cache_dir=self.TEST_DIR)
        assert third == os.path.join(self.TEST_DIR, url_to_filename(url, etag="1"))

        check_request_counts(heads=3, gets=2)
        check_contents(third)
Esempio n. 12
0
 def test_url_to_filename_with_etags_eliminates_quotes(self):
     """Round-trip URLs whose ETag is passed to ``url_to_filename`` in double quotes.

     The cache file and a ``.json`` metadata sidecar are created in
     ``self.TEST_DIR``; ``filename_to_url`` is then expected to return the
     original URL and the ETag without quotes.
     """
     for url in [u'http://allenai.org', u'http://allennlp.org',
                 u'https://www.google.com', u'http://pytorch.org']:
         filename = url_to_filename(url, etag=u'"mytag"')
         assert u"http" not in filename
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         # NOTE(review): the sidecar written here already stores the ETag
         # without quotes, so the ETag assertion below reflects this value.
         # The file is opened in a ``with`` block so it is closed and flushed
         # before the lookup instead of being left to the garbage collector.
         with open(os.path.join(self.TEST_DIR, filename + u'.json'), u'w') as meta_file:
             json.dump({u'url': url, u'etag': u'mytag'}, meta_file)
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag == u"mytag"
 def test_url_to_filename(self):
     """Check that a URL without an ETag round-trips through the filename helpers."""
     sample_urls = (
         'http://allenai.org', 'http://allennlp.org',
         'https://www.google.com', 'http://pytorch.org',
     )
     for original_url in sample_urls:
         encoded_name = url_to_filename(original_url)
         # The generated name must not contain the scheme text.
         assert "http" not in encoded_name
         cache_file = os.path.join(self.TEST_DIR, encoded_name)
         pathlib.Path(cache_file).touch()
         decoded_url, decoded_etag = filename_to_url(encoded_name)
         assert decoded_url == original_url
         assert decoded_etag is None
 def test_url_to_filename_with_etags_eliminates_quotes(self):
     """Check that a double-quoted ETag comes back from ``filename_to_url`` unquoted."""
     sample_urls = (
         'http://allenai.org', 'http://allennlp.org',
         'https://www.google.com', 'http://pytorch.org',
     )
     for original_url in sample_urls:
         # The ETag is passed with its surrounding double quotes.
         encoded_name = url_to_filename(original_url, etag='"mytag"')
         assert "http" not in encoded_name
         cache_file = os.path.join(self.TEST_DIR, encoded_name)
         pathlib.Path(cache_file).touch()
         decoded_url, decoded_etag = filename_to_url(encoded_name)
         assert decoded_url == original_url
         # The quotes are expected to be gone after the round trip.
         assert decoded_etag == "mytag"
Esempio n. 15
0
 def test_url_to_filename(self):
     """Check that ``filename_to_url`` needs both the cache file and its metadata.

     For each URL (including one with a very long path) the test expects
     ``FileNotFoundError`` while nothing exists, again after only the cache
     file has been created, and a successful round trip once the ``.json``
     sidecar with ``etag: None`` has been written.
     """
     for url in ['http://allenai.org', 'http://allennlp.org',
                 'https://www.google.com', 'http://pytorch.org',
                 'https://s3-us-west-2.amazonaws.com/allennlp' + '/long' * 20 + '/url']:
         filename = url_to_filename(url)
         assert "http" not in filename
         # Nothing has been written to the cache directory yet.
         with pytest.raises(FileNotFoundError):
             filename_to_url(filename, cache_dir=self.TEST_DIR)
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         # The cache file alone is not sufficient; the sidecar is still missing.
         with pytest.raises(FileNotFoundError):
             filename_to_url(filename, cache_dir=self.TEST_DIR)
         # Write the sidecar through a context manager so the handle is closed
         # (and the JSON flushed) before it is looked up below.
         with open(os.path.join(self.TEST_DIR, filename + '.json'), 'w') as meta_file:
             json.dump({'url': url, 'etag': None}, meta_file)
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag is None
Esempio n. 16
0
 def test_url_to_filename(self):
     """Verify the lookup fails until both the cache file and its ``.json`` sidecar exist.

     The URL list includes one entry with a long repeated path. For every
     URL, ``filename_to_url`` must raise ``FileNotFoundError`` before any
     file exists and after only the cache file exists, then return the URL
     with an ETag of ``None`` once the sidecar is present.
     """
     for url in ['http://allenai.org', 'http://allennlp.org',
                 'https://www.google.com', 'http://pytorch.org',
                 'https://s3-us-west-2.amazonaws.com/allennlp' + '/long' * 20 + '/url']:
         filename = url_to_filename(url)
         assert "http" not in filename
         with pytest.raises(FileNotFoundError):
             filename_to_url(filename, cache_dir=self.TEST_DIR)
         pathlib.Path(os.path.join(self.TEST_DIR, filename)).touch()
         with pytest.raises(FileNotFoundError):
             filename_to_url(filename, cache_dir=self.TEST_DIR)
         # Close the sidecar deterministically rather than leaving an
         # anonymous file object for the garbage collector to clean up.
         meta_path = os.path.join(self.TEST_DIR, filename + '.json')
         with open(meta_path, 'w') as meta_file:
             json.dump({'url': url, 'etag': None}, meta_file)
         back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
         assert back_to_url == url
         assert etag is None
Esempio n. 17
0
    def test_cached_path(self):
        """Exercise ``cached_path`` with a missing file, a bad URI, a local file and a URL."""
        glove_url = 'http://fake.datastore.com/glove.txt.gz'
        set_up_glove(glove_url, self.glove_bytes)

        # A local path that does not exist must raise FileNotFoundError.
        with pytest.raises(FileNotFoundError):
            cached_path("tests/fixtures/does_not_exist/fake_file.tar.gz")

        # A URI with an unknown scheme must raise ValueError.
        with pytest.raises(ValueError):
            cached_path("fakescheme://path/to/fake/file.tar.gz")

        # An existing local file is returned unchanged.
        local_file = self.glove_file
        assert cached_path(local_file) == local_file

        # A URL is stored under the cache directory.
        cached_location = cached_path(glove_url, cache_dir=self.TEST_DIR)

        assert len(responses.calls) == 2
        expected_location = os.path.join(self.TEST_DIR, url_to_filename(glove_url, etag="0"))
        assert cached_location == expected_location

        with open(cached_location, 'rb') as handle:
            stored_bytes = handle.read()
        assert stored_bytes == self.glove_bytes